Python Web Scraping Study Notes 3-1: Hands-On Project with 58同城 (58.com)

Published On 2020/01/30 Thursday, Singapore

This project scrapes the location, price, layout, and other details of second-hand housing listings in Chengdu.

This post contains my study notes for the DataCastle Python web scraping course (beginner + advanced track).


Scraping Steps

  1. Open the Chengdu residential-community (小区) pages grouped by administrative district and collect the link for each district.
  2. For each district, collect the links to its communities, open each community's detail page, and scrape the name, price, address, year built, and similar fields.
  3. For each community, collect the listing links on the first page of its second-hand-housing section, open each detail page to scrape the price, and compute the community's average price in the pipeline.
  4. For each community, collect the listing links on the first page of its rental section, open each detail page, and scrape the title, price, layout, and similar fields.
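
As an overview, the four steps map onto the spider callbacks defined in the code below. A rough sketch of the request flow (class and callback names follow the real spider; it re-requests the same URL only to show the chaining, and the page-parsing details are omitted):

    import scrapy

    class FlowSketch(scrapy.Spider):
        name = 'flow_sketch'
        start_urls = ['https://cd.58.com/xiaoqu/']   # step 1: community list grouped by district

        def parse(self, response):
            # step 2: one request per community detail page
            yield scrapy.Request(response.url, callback=self.fang_detail, dont_filter=True)

        def fang_detail(self, response):
            # steps 3 and 4: follow the second-hand and rental listing pages
            yield scrapy.Request(response.url, callback=self.shou_list, dont_filter=True)
            yield scrapy.Request(response.url, callback=self.zu_list, dont_filter=True)

        def shou_list(self, response):
            pass   # step 3: yield second-hand items; the pipeline averages their prices

        def zu_list(self, response):
            pass   # step 4: yield rental items (title, price, layout)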

Code

  1. Open a command prompt and run the following command to start a new Scrapy project named fang58.
    scrapy startproject fang58
    
  2. Change into the fang58 directory and generate the spider file fang, whose target site is 58.com (the resulting project layout is shown after the command).
    cd fang58
    scrapy genspider fang 58.com
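
    After these two commands, the project has the standard Scrapy layout; the paths referenced in the following steps live inside it:

     fang58/
         scrapy.cfg
         fang58/
             __init__.py
             items.py
             middlewares.py
             pipelines.py
             settings.py
             spiders/
                 __init__.py
                 fang.py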
    
  3. Go to fang58/fang58/spiders and add the following code to fang.py. When parsing the pages, selector paths copied straight from Chrome DevTools may fail to match anything. The reason is that Chrome can insert extra tags that are not in the HTML the server actually returns (typically a tbody inside tables), which is why the table selectors below omit tbody; a short demo follows the spider code.

     import scrapy
     from pyquery import PyQuery
     from fang58.items import Fang58Item,Fang58ItemShou,Fang58ItemZu
     class FangSpider(scrapy.Spider):
         name='fang'
         allowed_domains=['58.com']
         start_urls=['https://cd.58.com/xiaoqu/']
    
         def parse(self,response):
             jpy=PyQuery(response.text)
             li_s=jpy('body > div.main-wrap > div.content-wrap > div.content-side-left > ul > li')
             for li in li_s.items():
                 url=li('div.list-info > h2 > a').attr('href')
                 yield scrapy.Request(url=url,callback=self.fang_detail)
    
         def fang_detail(self,response):
             # community detail page: follow the for-sale and for-rent links, then collect the basic info
             jpy=PyQuery(response.text)
             zai_shou_url=jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr.tb-btm > td:nth-child(2) > a').attr('href')
             zai_zu_url=jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr.tb-btm > td:nth-child(4) > a').attr('href')
             yield scrapy.Request(url='https:{}'.format(zai_shou_url),callback=self.shou_list)
             yield scrapy.Request(url='https:{}'.format(zai_zu_url),callback=self.zu_list)
             i=Fang58Item()
             i['title']=jpy('body > div.body-wrapper > div.title-bar > span.title').text()
             i['price']=jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.price-container').text()
             i['di_zhi']=jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr:nth-child(1) > td:nth-child(3)').text()
             i['nian_dai']=jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr:nth-child(5) > td:nth-child(2)').text()
             yield i
    
    
         def shou_list(self,response):
             # second-hand listings page (first page only)
             jpy=PyQuery(response.text)
             tr_s=jpy('#infolist > div.listwrap > table > tr')
             for tr in tr_s.items():
                 i=Fang58ItemShou()
                 i['title']=tr('td.t > a.t').text()
                 i['url']=tr('td.t > a.t').attr('href')
                 i['dan_jia']=tr('td.tc > span:nth-child(3)').text()
                 i['zong_jia']=tr('td.tc > b').text()
                 i['mian_ji']=tr('td.tc > span:nth-child(5)').text()
                 i['url_r']=response.url   # the listing-page URL, used to group prices per community
                 yield i
         def zu_list(self,response):
             # rental listings page (first page only)
             jpy=PyQuery(response.text)
             # each tr in the list table is one listing
             tr_s=jpy('#infolist > div.listwrap > table > tr')
             for tr in tr_s.items():
                 i=Fang58ItemZu()
                 i['title']=tr('td.t > a.t').text()
                 i['price']=tr('td.tc > b').text()
                 i['url']=tr('td.t > a.t').attr('href')
                 yield i
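
    A quick illustration of the Chrome DevTools pitfall mentioned in this step: a copied selector usually contains the tbody element that Chrome inserts into tables, while the raw HTML parsed by PyQuery may not contain it, so the copied selector matches nothing. The snippet below uses made-up HTML; the pattern mirrors the table selectors in fang.py above:

     from pyquery import PyQuery

     # made-up HTML resembling what a server returns: note there is no <tbody>
     html = '<div><table><tr class="tb-btm"><td>A</td><td><a href="//x">link</a></td></tr></table></div>'
     jpy = PyQuery(html)

     # a selector copied from Chrome DevTools typically includes the tbody Chrome added:
     print(jpy('table > tbody > tr.tb-btm > td:nth-child(2) > a').attr('href'))  # -> None
     # dropping tbody, as the selectors in fang.py do, matches the raw HTML:
     print(jpy('table > tr.tb-btm > td:nth-child(2) > a').attr('href'))          # -> //x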
    
    

    Then go to fang58/fang58 and add the following code to items.py.

     import scrapy
     class Fang58Item(scrapy.Item):
         title=scrapy.Field()
         price=scrapy.Field()
         di_zhi=scrapy.Field()
         nian_dai=scrapy.Field()
    
     class Fang58ItemZu(scrapy.Item):
         url=scrapy.Field()
         title=scrapy.Field()
         price=scrapy.Field()
    
     class Fang58ItemShou(scrapy.Item):
         url=scrapy.Field()
         title=scrapy.Field()
         zong_jia=scrapy.Field()
         dan_jia=scrapy.Field()
         mian_ji=scrapy.Field()
         url_r=scrapy.Field()
    
  4. Go to fang58/fang58 and add the following code to pipelines.py. It processes the items and writes them to MongoDB; the average price per community is also computed here (a standalone sketch of that logic follows the code).

     import pymongo
     from fang58.items import Fang58Item,Fang58ItemShou,Fang58ItemZu
    
     class Fang58Pipeline(object):
         def process_item(self,item,spider):
             return item
    
     class MongoPipeline(object):
         collection_name='scrapy_items'
         item_save=dict() 
    
         def __init__(self, mongo_uri, mongo_db):
             self.mongo_uri=mongo_uri
             self.mongo_db=mongo_db
    
         @classmethod
         def from_crawler(cls, crawler):
             # pull the MongoDB configuration out of the Scrapy settings
             return cls(
                 mongo_uri=crawler.settings.get('MONGODB_URI'),
                 mongo_db=crawler.settings.get('MONGODB_DATABASE', 'items')
             )
    
         def open_spider(self, spider):
             self.client = pymongo.MongoClient(self.mongo_uri)  # connect to the database
             self.db = self.client[self.mongo_db]
    
         def close_spider(self,spider):
             self.client.close()
    
    
         def process_item(self,item,spider):
             if isinstance(item,Fang58ItemShou):
                 # average the unit price per community, keyed by the listing-page URL (url_r)
                 item_save=self.item_save.setdefault(item['url_r'],[])
                 item_save.append(item)
                 if len(item_save)>=30:
                     # a full batch of 30 listings: compute the average price and reset
                     sum_=0
                     for i in item_save:
                         sum_+=int(i['dan_jia'].replace('元/m²',''))
                     jun_jia=sum_/len(item_save)
                     print('均价:{}'.format(jun_jia))
                     del self.item_save[item['url_r']]
             self.db[self.collection_name].insert_one(dict(item))

             return item
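
    The averaging in process_item can be hard to follow inside the pipeline, so here is a standalone sketch of the same idea with made-up prices and a placeholder community URL (batch=3 instead of 30 just to keep the example short):

     item_save = {}  # maps a community's listing-page URL to the unit prices seen so far

     def add_price(url_r, dan_jia, batch=30):
         prices = item_save.setdefault(url_r, [])
         prices.append(int(dan_jia.replace('元/m²', '')))
         if len(prices) >= batch:
             # a full batch: print the average unit price and reset the accumulator
             print('均价:{}'.format(sum(prices) / len(prices)))
             del item_save[url_r]

     for p in ['10000元/m²', '12000元/m²', '14000元/m²']:
         add_price('https://cd.58.com/xiaoqu/placeholder/', p, batch=3)
     # prints: 均价:12000.0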
    
  5. Go to fang58/fang58 and add the following code to middlewares.py. It attaches a proxy to outgoing requests (a toy proxy_pool sketch follows the code).

     from scrapy import signals

     class ProxyMiddleware(object):

         def process_request(self, request, spider):
             # attach a proxy to every request; replace the line below with your own proxy source
             request.meta['proxy'] = 'http://{}'.format(proxy_pool.pop())

         def process_response(self, request, response, spider):
             return response

         def process_exception(self, request, exception, spider):
             pass
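
    The proxy_pool object referenced above is not defined anywhere in these notes; it stands for whatever proxy source you use. A minimal sketch, assuming you simply keep a list of proxy addresses (the addresses below are placeholders, not working proxies):

     import random

     PROXIES = ['1.2.3.4:8080', '5.6.7.8:3128']   # replace with your own proxies

     class SimpleProxyPool(object):
         def pop(self):
             # return a random proxy; a real pool would also test and retire dead proxies
             return random.choice(PROXIES)

     proxy_pool = SimpleProxyPool()
     print('http://{}'.format(proxy_pool.pop()))   # e.g. http://1.2.3.4:8080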
    
  6. Finally, go to fang58/fang58 and add the following code to settings.py. It configures the database connection and is also where the pipeline and the middleware are activated and the pipeline order is set; the activation part is sketched right after the snippet.

     BOT_NAME = 'fang58'
     SPIDER_MODULES = ['fang58.spiders']
     NEWSPIDER_MODULE = 'fang58.spiders'

     MONGODB_HOST = '127.0.0.1'   # local database
     MONGODB_PORT = '27017'    # database port
     MONGODB_URI = 'mongodb://{}:{}'.format(MONGODB_HOST, MONGODB_PORT)
     MONGODB_DATABASE = 'test'  # database name
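
    The snippet above only covers the MongoDB settings. To actually activate the pipeline and the middleware and set their order, settings.py also needs entries like the following (the class paths come from pipelines.py and middlewares.py above; the priority numbers are conventional Scrapy values, not mandated):

     ITEM_PIPELINES = {
         'fang58.pipelines.MongoPipeline': 300,   # lower numbers run earlier
     }
     DOWNLOADER_MIDDLEWARES = {
         'fang58.middlewares.ProxyMiddleware': 543,
     }

    With everything in place, the spider is started from the project root with scrapy crawl fang.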
    




