How to use Request function in a Scrapy Spider?

from string import join
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders.crawl import Rule, CrawlSpider
from scrapy.http.request import Request
from scrapy.selector import HtmlXPathSelector
from Gfire.items import GfireItem

class GuideSpider(CrawlSpider):
    name = "Gfire"
    allowed_domains = ['www.example.com']
    start_urls = [
        "http://www.example.com/gfire/guides"
    ]
    rules = (
        Rule(SgmlLinkExtractor(allow=("gfire/guides.*page=")), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        sites = hxs.select('//div[@class="title"]')
        for site in sites:
            item = GFireItem()
            item['title'] = site.select('./a/text()').extract()
            item['guide_url'] = site.select('./a/@href').extract()
            item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
            items.append(item)
        return Request(items[1], callback=self.parse_item2)

    def parse_item2(self, response):
        hxs = HtmlXPathSelector(response)
        hero = hxs.select("//h3/a/text()").extract()
        return hero

Can't get this spider to work. The request function contains items[1] that should be item['guide_url'] but it says me that the parameter has to be str or unicode. How can I corret this error? And how can I pass to the callback function the items list? Via request.meta?

Answers


Your item[1] is actually an instance of GFireItem.

I'm not certain why you are creating these as you only use one (the second site in your list of sites), discarding the rest of the list.

That aside, you need to extract the items[1]['guide_url'] url when creating the Request:

        return Request(items[1]['guide_url'], callback=self.parse_item2)

def parse_item(self, response):
    hxs = HtmlXPathSelector(response)
    items = []
    sites = hxs.select('//div[@class="title"]')
    for site in sites:
        item = GFireItem()
        item['title'] = site.select('./a/text()').extract()
        item['guide_url'] = site.select('./a/@href').extract()
        item['guide_url'] = "http://www.example.com" + join(item['guide_url'])
        items.append(item)
    return Request(items[1]['guide_url'], request.meta={'items':items}, callback=self.parse_item2)

def parse_item2(self, response):
    items = response.meta["items"]
    hxs = HtmlXPathSelector(response)
    hero = hxs.select("//h3/a/text()").extract()
    return hero

Need Your Help

How to get a 2d array to print from 1 to 80

java arrays multidimensional-array

The goal for my program is for a 2d array to print from 1 to 80. I am trying to get it to look sort of like this.

About UNIX Resources Network

Original, collect and organize Developers related documents, information and materials, contains jQuery, Html, CSS, MySQL, .NET, ASP.NET, SQL, objective-c, iPhone, Ruby on Rails, C, SQL Server, Ruby, Arrays, Regex, ASP.NET MVC, WPF, XML, Ajax, DataBase, and so on.