Python Web Scraping Study Notes 3-1: Hands-on Project with 58.com (58同城)
Published On 2020/01/30 Thursday, Singapore
This project scrapes the location, price, layout, and other information of second-hand housing listings in Chengdu.
This post is part of my study notes for the DataCastle Python Web Scraping (Beginner + Advanced) course.
Crawling Steps
- Open the Chengdu residential community (xiaoqu) page organized by district and collect the link for each district.
- For each district, collect the links to its communities, open each community's detail page, and scrape the name, price, address, year built, and other information.
- Collect the listing links on the first page of each community's second-hand (for sale) housing page, open each listing's detail page to scrape the price, and compute the community's average price in the pipeline.
- Collect the listing links on the first page of each community's rental page, open each detail page, and scrape the name, price, layout, and other information.
Code
- Open a command prompt and run the following command to start a new Scrapy project named fang58.
scrapy startproject fang58
- Change into the fang58 directory and generate the spider file fang, whose target site is 58.com.

cd fang58
scrapy genspider fang 58.com
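For orientation, these two commands generate roughly the following layout; the paths mentioned in the steps below refer to it:

fang58/
├── scrapy.cfg
└── fang58/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── fang.py    # created by scrapy genspider fang 58.com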
- Go to fang58/fang58/spiders and add the following code to fang.py. When working out selector paths, selectors copied from Google Chrome's developer tools may fail, because Chrome can insert elements (such as tbody inside tables) that are not in the raw HTML the server returns. For example, when selecting the table rows here, the tbody in the copied path has to be removed (see the short illustration after the spider code).
import scrapy
from pyquery import PyQuery

from fang58.items import Fang58Item, Fang58ItemShou, Fang58ItemZu


class FangSpider(scrapy.Spider):
    name = 'fang'
    allowed_domains = ['58.com']
    start_urls = ['https://cd.58.com/xiaoqu/']

    def parse(self, response):
        # community list page: collect the link to each community's detail page
        jpy = PyQuery(response.text)
        li_s = jpy('body > div.main-wrap > div.content-wrap > div.content-side-left > ul > li')
        for li in li_s.items():
            url = li('div.list-info > h2 > a').attr('href')
            yield scrapy.Request(url=url, callback=self.fang_detail)

    def fang_detail(self, response):
        # community detail page: follow the "for sale" and "for rent" links,
        # then collect the community's basic information
        jpy = PyQuery(response.text)
        zai_shou_url = jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr.tb-btm > td:nth-child(2) > a').attr('href')
        zai_zu_url = jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr.tb-btm > td:nth-child(4) > a').attr('href')
        yield scrapy.Request(url='https:{}'.format(zai_shou_url), callback=self.shou_list)
        yield scrapy.Request(url='https:{}'.format(zai_zu_url), callback=self.zu_list)
        i = Fang58Item()
        i['title'] = jpy('body > div.body-wrapper > div.title-bar > span.title').text()
        i['price'] = jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.price-container').text()
        i['di_zhi'] = jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr:nth-child(1) > td:nth-child(3)').text()
        i['nian_dai'] = jpy('body > div.body-wrapper > div.basic-container > div.info-container > div.info-tb-container > table > tr:nth-child(5) > td:nth-child(2)').text()
        yield i

    def shou_list(self, response):
        # second-hand (for sale) listing page
        jpy = PyQuery(response.text)
        tr_s = jpy('#infolist > div.listwrap > table > tr')
        for tr in tr_s.items():
            i = Fang58ItemShou()
            i['title'] = tr('td.t > a.t').text()
            i['url'] = tr('td.t > a.t').attr('href')
            i['dan_jia'] = tr('td.tc > span:nth-child(3)').text()
            i['zong_jia'] = tr('td.tc > b').text()
            i['mian_ji'] = tr('td.tc > span:nth-child(5)').text()
            i['url_r'] = response.url
            yield i

    def zu_list(self, response):
        # rental listing page: get the tr rows
        jpy = PyQuery(response.text)
        tr_s = jpy('#infolist > div.listwrap > table > tr')
        for tr in tr_s.items():
            i = Fang58ItemZu()
            i['title'] = tr('td.t > a.t').text()
            i['price'] = tr('td.tc > b').text()
            i['url'] = tr('td.t > a.t').attr('href')
            yield i
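As a small, self-contained illustration of the tbody issue mentioned above (the HTML snippet is made up): Chrome's element panel shows a tbody inside the table, so a copied selector includes it, but the raw HTML parsed by PyQuery has no such element.

from pyquery import PyQuery

# Hypothetical raw HTML as the server would send it: the table has no <tbody>.
html = '<html><body><table id="t"><tr><td>12000元/m²</td></tr></table></body></html>'
jpy = PyQuery(html)

print(jpy('#t > tbody > tr').text())   # ''            - selector copied from Chrome finds nothing
print(jpy('#t > tr').text())           # '12000元/m²'  - same selector with tbody removed works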
- Go to fang58/fang58 and add the following code to items.py.
import scrapy


class Fang58Item(scrapy.Item):
    # community information
    title = scrapy.Field()
    price = scrapy.Field()
    di_zhi = scrapy.Field()     # address
    nian_dai = scrapy.Field()   # year built


class Fang58ItemZu(scrapy.Item):
    # rental listing
    url = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()


class Fang58ItemShou(scrapy.Item):
    # second-hand (for sale) listing
    url = scrapy.Field()
    title = scrapy.Field()
    zong_jia = scrapy.Field()   # total price
    dan_jia = scrapy.Field()    # unit price
    mian_ji = scrapy.Field()    # floor area
    url_r = scrapy.Field()      # URL of the listing page the item came from
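A quick aside not in the original notes: these Item classes behave like restricted dicts, so only declared fields can be set, and dict(item) is what the pipeline below ends up storing in MongoDB. The values here are made-up placeholders.

from fang58.items import Fang58ItemShou

i = Fang58ItemShou()
i['title'] = '某小区 3室2厅'     # declared field: fine (placeholder value)
i['dan_jia'] = '12000元/m²'
# i['foo'] = 1                   # undeclared field: would raise KeyError

print(dict(i))                   # {'title': '某小区 3室2厅', 'dan_jia': '12000元/m²'}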
- Go to fang58/fang58 and add the following code to pipelines.py. It handles the data processing and writes the items into the database.
import pymongo

from fang58.items import Fang58Item, Fang58ItemShou, Fang58ItemZu


class Fang58Pipeline(object):
    def process_item(self, item, spider):
        return item


class MongoPipeline(object):
    collection_name = 'scrapy_items'
    item_save = dict()  # buffers for-sale items per community, keyed by listing-page URL

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # pull the MongoDB configuration out of settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGODB_URI'),
            mongo_db=crawler.settings.get('MONGODB_DATABASE', 'items')
        )

    def open_spider(self, spider):
        # connect to the database
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item, Fang58ItemShou):
            # compute the average unit price per community:
            # keep adding items from the same listing page until 30 are collected,
            # then compute the average price
            item_save = self.item_save.get(item['url_r'], [])
            if item_save:
                if len(item_save) >= 29:
                    # the 30th item completes the batch: compute the average price
                    item_save.append(item)
                    sum_ = 0
                    for i in item_save:
                        sum_ += int(i['dan_jia'].replace('元/m²', ''))
                    jun_jia = sum_ / len(item_save)
                    print('均价:{}'.format(jun_jia))
                    del self.item_save[item['url_r']]
                else:
                    item_save.append(item)
            else:
                self.item_save[item['url_r']] = [item]
        self.db[self.collection_name].insert_one(dict(item))
        return item
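To check that items are actually reaching MongoDB, a quick sketch using the connection values configured later in settings.py (database test, collection scrapy_items):

from pymongo import MongoClient

client = MongoClient('mongodb://127.0.0.1:27017')
db = client['test']

# number of stored items and one sample document
print(db['scrapy_items'].count_documents({}))
print(db['scrapy_items'].find_one())

client.close()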
- Go to fang58/fang58 and add the following code to middlewares.py. It attaches a proxy to each request.
from scrapy import signals


class ProxyMiddleware(object):
    def process_request(self, request, spider):
        # attach a proxy server to the request;
        # replace the line below with your own way of obtaining proxies
        request.meta['proxy'] = 'http://{}'.format(proxy_pool.pop())

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass
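proxy_pool is not defined anywhere in these notes; it stands for whatever proxy source you use. A minimal stand-in, assuming a hard-coded list of proxy addresses (the addresses below are placeholders and will not work as-is):

import random

class SimpleProxyPool(object):
    # Hypothetical helper: hands out a proxy address on each pop().
    def __init__(self, proxies):
        self.proxies = list(proxies)

    def pop(self):
        # pick a proxy at random without removing it, so the pool is reusable
        return random.choice(self.proxies)

proxy_pool = SimpleProxyPool([
    '123.123.123.123:8888',    # placeholder
    '124.124.124.124:8888',    # placeholder
])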
- Finally, go to fang58/fang58 and add the following code to settings.py. It is responsible for activating the pipeline and the middleware and for setting the pipeline order.
BOT_NAME = 'fang58'

SPIDER_MODULES = ['fang58.spiders']
NEWSPIDER_MODULE = 'fang58.spiders'

MONGODB_HOST = '127.0.0.1'    # local database
MONGODB_PORT = '27017'        # database port
MONGODB_URI = 'mongodb://{}:{}'.format(MONGODB_HOST, MONGODB_PORT)
MONGODB_DATABASE = 'test'     # database name
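The block above only covers the MongoDB connection. To actually activate the pipeline and the proxy middleware as described, ITEM_PIPELINES and DOWNLOADER_MIDDLEWARES need entries as well; a minimal sketch, with conventional priority values:

ITEM_PIPELINES = {
    'fang58.pipelines.MongoPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    'fang58.middlewares.ProxyMiddleware': 543,
}

With everything in place, the spider is started from the project root:

scrapy crawl fang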