白癜风怎么治疗 http://www.baidianfeng51.cn/
Crawler.crawl → Spider.start_requests → Engine.open_spider → CallLaterOnce.schedule → _next_request → SpiderMiddlewareManager.process_start_requests → Scheduler.open（初始化优先级队列、初始化基于磁盘的队列、调用指纹过滤器的 open 方法）→ Scraper.open_spider（调用所有 pipeline 的 open_spider 方法）→ Crawler.open_spider → Signals.send_catch_log_deferred → Engine._next_request → Engine._needs_backout → Engine._next_request_from_scheduler → Engine.crawl → Scheduler.enqueue_request（请求入队）→ RFPDupeFilter.request_seen（指纹过滤）→ Engine._download → Downloader.fetch → DownloaderMiddlewareManager.download → Downloader._enqueue_request → Downloader._download → DownloadHandlers.download_request（调用在 settings 中定义的 DOWNLOAD_HANDLERS_BASE 和 DOWNLOAD_HANDLERS 下载器进行下载）→ ExecutionEngine._handle_downloader_output（处理下载结果）→ Scraper.enqueue_scrape → Scraper._scrape_next → Scraper._scrape → Scraper._scrape2 → Scraper.call_spider → Spider.parse 或者指定的回调方法 → Scraper.handle_spider_output → Scraper._process_spidermw_output → ItemPipelineManager.process_item（调用定义在 settings 中的 ITEM_PIPELINES_BASE 和 ITEM_PIPELINES 中的方法）→ Scraper._itemproc_finished（处理异常并继续处理）→ Engine.start
流转过程配合着官方的数据流转图查看比较好。
在核心组件初始化里面交代了 Crawler.crawl 方法中调用了 spider.start_requests 方法来获取种子链接。
class Crawler:

    @defer.inlineCallbacks
    def crawl(self, *args, **kwargs):
        """Run one crawl: build the spider and engine, feed the seed
        requests to the engine and start it.

        Raises RuntimeError if a crawl is already in progress; on any
        failure the engine is closed and the exception re-raised.
        """
        if self.crawling:
            raise RuntimeError("Crawling already taking place")
        self.crawling = True
        try:
            # Look up the spider class via the SpiderLoader and instantiate it.
            self.spider = self._create_spider(*args, **kwargs)
            # Create the execution engine.
            self.engine = self._create_engine()
            # Call the Spider's start_requests() to obtain the seed requests.
            start_requests = iter(self.spider.start_requests())
            # Open the spider on the engine, handing over the seed requests.
            yield self.engine.open_spider(self.spider, start_requests)
            yield defer.maybeDeferred(self.engine.start)
        except Exception:
            # Reset state and make sure the engine is shut down before
            # propagating the error.
            self.crawling = False
            if self.engine is not None:
                yield self.engine.close()
            raise
class Spider(object_ref):

    def start_requests(self):
        """Yield the seed Requests for this spider.

        By default builds one Request per URL in ``start_urls`` with
        ``dont_filter=True`` (seeds bypass the dupefilter). Falls back to
        the deprecated ``make_requests_from_url`` when a subclass still
        overrides it, emitting a deprecation warning.
        """
        cls = self.__class__
        # Common mistake: defining 'start_url' (singular) instead of 'start_urls'.
        if not self.start_urls and hasattr(self, 'start_url'):
            raise AttributeError(
                "Crawling could not start: 'start_urls' not found "
                "or empty (but found 'start_url' attribute instead, "
                "did you miss an 's'?)"
            )
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead "
                f"(see {cls.__module__}.{cls.__name__}).",
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            for url in self.start_urls:
                yield Request(url, dont_filter=True)

    def make_requests_from_url(self, url):
        """This method is deprecated."""
        warnings.warn(
            "Spider.make_requests_from_url method is deprecated: "
            "it will be removed and not be called by the default "
            "Spider.start_requests method in future Scrapy releases. "
            "Please override Spider.start_requests method instead."
        )
        return Request(url, dont_filter=True)
Spider.start_requests 最终是返回了一个 Request 实例。
class Request(object_ref):
    """Scrapy's request object: URL, HTTP method, headers, body, callbacks
    and scheduling metadata for one fetch."""

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None, flags=None, cb_kwargs=None):
        # Encoding has to be set first: _set_url/_set_body depend on it.
        self._encoding = encoding
        # HTTP method, normalized to upper case.
        self.method = str(method).upper()
        # URL (validated/encoded by the setter).
        self._set_url(url)
        # Request body (encoded by the setter).
        self._set_body(body)
        # Scheduling priority must be an int.
        if not isinstance(priority, int):
            raise TypeError(f"Request priority not an integer: {priority!r}")
        self.priority = priority
        # Success callback, validated up front so errors surface early.
        if callback is not None and not callable(callback):
            raise TypeError(f"callback must be a callable, got {type(callback).__name__}")
        # Failure callback.
        if errback is not None and not callable(errback):
            raise TypeError(f"errback must be a callable, got {type(errback).__name__}")
        self.callback = callback
        self.errback = errback
        # Cookies and headers.
        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        # Whether the dupefilter should skip this request.
        self.dont_filter = dont_filter
        # Copy meta/cb_kwargs so callers can't mutate our state afterwards.
        self._meta = dict(meta) if meta else None
        self._cb_kwargs = dict(cb_kwargs) if cb_kwargs else None
        self.flags = [] if flags is None else list(flags)
Request 类定义了 Scrapy 的请求实例。
Engine 如何调度：yield self.engine.open_spider(self.spider, start_requests)
class ExecutionEngine:

    @defer.inlineCallbacks
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        """Open *spider* for crawling: build the scheduler, pass the seed
        requests through the spider middlewares, wire everything into a
        Slot and kick off the first scheduling pass."""
        if not self.has_capacity():
            raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
        logger.info("Spider opened", extra={'spider': spider})
        # Register _next_request so the reactor can call it repeatedly.
        nextcall = CallLaterOnce(self._next_request, spider)
        # Instantiate the scheduler.
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        # Let the spider middlewares process the seed requests.
        start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
        # Wrap everything in a slot object.
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        # Open scheduler (queues + dupefilter) and the scraper (pipelines).
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
        # Fire the first scheduling pass; heartbeat re-schedules every 5s.
        slot.nextcall.schedule()
        slot.heartbeat.start(5)


class CallLaterOnce:
    """Schedule a function to be called in the next reactor loop, but only
    if it hasn't been already scheduled since the last time it ran."""

    def __init__(self, func, *a, **kw):
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None  # pending reactor call, None when not scheduled

    def schedule(self, delay=0):
        # Only re-schedule once the previous scheduled run has fired.
        from twisted.internet import reactor
        if self._call is None:
            # Register *self* with the reactor; __call__ will run later.
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        # *self* was registered above, so the reactor invokes __call__;
        # clear the handle first so schedule() can register again.
        self._call = None
        return self._func(*self._a, **self._kw)
将上面注册的 nextcall = CallLaterOnce(self._next_request, spider) 在 twisted 的 reactor 中异步执行，只需要再次调用 nextcall.schedule() 方法就可以完成再次调用，这里调用的是 Engine._next_request。
在这之后，调用了 start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)，即爬虫中间件的 SpiderMiddlewareManager.process_start_requests 方法。
class Scheduler:

    def open(self, spider):
        """Set up queues and the dupefilter for *spider*."""
        self.spider = spider
        # In-memory priority queue, always created.
        self.mqs = self._mq()
        # Disk-backed queue only when a dqdir (JOBDIR) is configured.
        self.dqs = self._dq() if self.dqdir else None
        # Open the fingerprint-based duplicate filter.
        return self.df.open()

    def _mq(self):
        """Create a new priority queue instance, with in-memory storage"""
        return create_instance(self.pqclass,
                               settings=None,
                               crawler=self.crawler,
                               downstream_queue_cls=self.mqclass,
                               key='')

    def _dq(self):
        """Create a new priority queue instance, with disk storage"""
        # Restore the persisted priority state so a crawl can resume.
        state = self._read_dqs_state(self.dqdir)
        q = create_instance(self.pqclass,
                            settings=None,
                            crawler=self.crawler,
                            downstream_queue_cls=self.dqclass,
                            key=self.dqdir,
                            startprios=state)
        if q:
            logger.info("Resuming crawl (%(queuesize)d requests scheduled)",
                        {'queuesize': len(q)}, extra={'spider': self.spider})
        return q

    def _read_dqs_state(self, dqdir):
        """Load the persisted queue priorities from dqdir/active.json;
        return an empty tuple when no state file exists."""
        path = join(dqdir, 'active.json')
        if not exists(path):
            return ()
        with open(path) as f:
            return json.load(f)
class Scraper:

    @defer.inlineCallbacks
    def open_spider(self, spider):
        """Open the given spider for scraping and allocate resources for it"""
        # Slot caps the amount of response data being processed concurrently.
        self.slot = Slot(self.crawler.settings.getint('SCRAPER_SLOT_MAX_ACTIVE_SIZE'))
        # Call every item pipeline's open_spider method.
        yield self.itemproc.open_spider(spider)


# Engine._next_request scheduling loop.
class ExecutionEngine:

    def _next_request(self, spider):
        """One scheduling pass; invoked repeatedly via CallLaterOnce."""
        slot = self.slot
        if not slot:
            return
        # Engine paused: do nothing this pass.
        if self.paused:
            return
        # Drain the scheduler while we don't need to back out.
        # NOTE: on the very first pass the scheduler is empty, so this
        # breaks immediately and the seed-request branch below runs.
        while not self._needs_backout(spider):
            if not self._next_request_from_scheduler(spider):
                break
        # Feed one seed request per pass, if any remain and no backout needed.
        if slot.start_requests and not self._needs_backout(spider):
            try:
                request = next(slot.start_requests)
            except StopIteration:
                slot.start_requests = None
            except Exception:
                slot.start_requests = None
                logger.error('Error while obtaining start requests',
                             exc_info=True, extra={'spider': spider})
            else:
                # crawl() pushes the request into the scheduler's queue.
                self.crawl(request, spider)
        # Close the spider when idle (if configured to do so).
        if self.spider_is_idle(spider) and slot.close_if_idle:
            self._spider_idle(spider)

    def _needs_backout(self, spider):
        """Whether to stop pulling work, i.e. any of:
        1. the engine is stopped
        2. the slot is closing
        3. the downloader is over its concurrency limit
        4. the scraper has too much response data in flight
        """
        slot = self.slot
        return (
            not self.running
            or slot.closing
            or self.downloader.needs_backout()
            or self.scraper.slot.needs_backout()
        )

    def _next_request_from_scheduler(self, spider):
        """Pop one request from the scheduler and start downloading it.

        Returns the download Deferred, or None when the queue is empty.
        """
        slot = self.slot
        request = slot.scheduler.next_request()
        if not request:
            return
        # Download, then chain result handling, slot cleanup and the next
        # scheduling pass; each step logs its own failures.
        d = self._download(request, spider)
        d.addBoth(self._handle_downloader_output, request, spider)
        d.addErrback(lambda f: logger.info('Error while handling downloader output',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.remove_request(request))
        d.addErrback(lambda f: logger.info('Error while removing request from slot',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        d.addBoth(lambda _: slot.nextcall.schedule())
        d.addErrback(lambda f: logger.info('Error while scheduling new request',
                                           exc_info=failure_to_exc_info(f),
                                           extra={'spider': spider}))
        return d
class Engine:

    def crawl(self, request, spider):
        """Enqueue *request* for *spider* and trigger the next scheduling pass."""
        if spider not in self.open_spiders:
            raise RuntimeError(f"Spider {spider.name!r} not opened when crawling: {request}")
        # Push the request into the scheduler's queue...
        self.schedule(request, spider)
        # ...and kick the reactor loop for the next pass.
        self.slot.nextcall.schedule()

    def schedule(self, request, spider):
        """Hand *request* to the scheduler; signal request_dropped when it
        is rejected (e.g. filtered out as a duplicate)."""
        self.signals.send_catch_log(signals.request_scheduled,
                                    request=request, spider=spider)
        if not self.slot.scheduler.enqueue_request(request):
            self.signals.send_catch_log(signals.request_dropped,
                                        request=request, spider=spider)
class Scheduler:

    def enqueue_request(self, request):
        """Put *request* into a queue.

        Returns False (and logs via the dupefilter) when the request is a
        duplicate and dont_filter is not set; True otherwise. Tries the
        disk queue first and falls back to the in-memory queue.
        """
        # Unless dont_filter is set, reject requests the dupefilter has seen.
        if not request.dont_filter and self.df.request_seen(request):
            self.df.log(request, self.spider)
            return False
        # Try the disk-backed queue first.
        dqok = self._dqpush(request)
        if dqok:
            self.stats.inc_value('scheduler/enqueued/disk', spider=self.spider)
        else:
            # No disk queue (or push failed): use the in-memory queue.
            self._mqpush(request)
            self.stats.inc_value('scheduler/enqueued/memory', spider=self.spider)
        self.stats.inc_value('scheduler/enqueued', spider=self.spider)
        return True
指纹生成，没啥说的。
class RFPDupeFilter(BaseDupeFilter):

    def request_seen(self, request):
        """Return True when *request*'s fingerprint was seen before;
        otherwise record the fingerprint (and persist it when a file is
        attached) and return None (falsy)."""
        # Generate the request fingerprint.
        fp = self.request_fingerprint(request)
        # A fingerprint already in the set means a duplicate request.
        if fp in self.fingerprints:
            return True
        # Record the new fingerprint.
        self.fingerprints.add(fp)
        # Persist it when a JOBDIR file is open.
        if self.file:
            self.file.write(fp + '\n')

    def request_fingerprint(self, request):
        # Delegate to scrapy.utils.request.request_fingerprint.
        return request_fingerprint(request)
def request_fingerprint(request, include_headers=None, keep_fragments=False):
    """Return a SHA1 fingerprint for *request*.

    The hash covers the HTTP method, the canonicalized URL (optionally
    keeping fragments) and the body; selected headers are mixed in only
    when *include_headers* is given. Results are memoized per request in
    ``_fingerprint_cache``.
    """
    # Normalize the optional header list once so the cache key is stable.
    if include_headers:
        include_headers = tuple(to_bytes(h.lower()) for h in sorted(include_headers))
    cache = _fingerprint_cache.setdefault(request, {})
    cache_key = (include_headers, keep_fragments)
    if cache_key not in cache:
        # Build the fingerprint with SHA1.
        fp = hashlib.sha1()
        fp.update(to_bytes(request.method))
        fp.update(to_bytes(canonicalize_url(request.url, keep_fragments=keep_fragments)))
        fp.update(request.body or b'')
        if include_headers:
            for hdr in include_headers:
                if hdr in request.headers:
                    fp.update(hdr)
                    for v in request.headers.getlist(hdr):
                        fp.update(v)
        cache[cache_key] = fp.hexdigest()
    return cache[cache_key]
classEngine:def_download(self,request,spider):#ÏÂÔØslot=self.slotslot.add_request(request)def_on_success(response):#³É¹¦»Øµ÷£¬½á¹û±ØÐëÊÇRequest»òResponseifnotisinstance(response,(Response,Request)):raiseTypeError("Incorrecttype:expectedResponseorRequest,got"f"{type(response)}:{response!r}")#Èç¹û½á¹ûΪResponseÔò·µ»ØResponseifisinstance(response,Response):ifresponse.requestisNone:response.request=requestlogkws=self.logformatter.crawled(response.request,response,spider)iflogkwsisnotNone:logger.log(*logformatter_adapter(logkws),extra={spider:spider})self.signals.send_catch_log(signal=signals.response_received,response=response,request=response.request,spider=spider,)returnresponsedef_on_