用Python爬取链家的租房信息(记录自己的第一个Python代码),
里面涉及到的主要的点有:使用代理ip访问;读取网页;翻页等。欢迎交流
代码如下:
import requests
import urllib.request#urllib.request功能的了解
from bs4 import BeautifulSoup#BeautifulSoup功能了解
import bs4
import random
import re##通过函数获取网页信息
def gethtml(url):
    """Fetch *url* via a hard-coded HTTP proxy and return it parsed as BeautifulSoup.

    Installs a global urllib opener that routes traffic through a public proxy
    (to reduce the chance of the scraper's own IP being blocked) and sends a
    desktop-browser User-Agent header.

    Parameters
    ----------
    url : str
        Absolute URL of the page to download.

    Returns
    -------
    bs4.BeautifulSoup
        The page decoded as UTF-8 and parsed with the 'lxml' parser.

    Raises
    ------
    urllib.error.URLError
        If the proxy or the target site is unreachable.
    """
    # Access through a proxy IP.
    # NOTE(review): the proxy address is hard-coded and public proxies rot
    # quickly — expect this to need updating.
    proxy_support = urllib.request.ProxyHandler({
        'http': '119.6.144.73:81'})
    opener = urllib.request.build_opener(proxy_support)
    # Pretend to be a normal desktop browser; some sites refuse the default
    # Python-urllib User-Agent.
    opener.addheaders = [('User-Agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18363')]
    # This installs the opener process-wide, affecting every later urlopen call.
    urllib.request.install_opener(opener)
    # Read the page body. `with` guarantees the response is closed even on
    # error — the original wrote `zf.close` (attribute access, never called),
    # leaking the connection.
    with urllib.request.urlopen(url) as zf:
        html = zf.read()
    ht = html.decode('utf8')
    Soup = BeautifulSoup(ht, 'lxml')
    return Soup
# ---- Scrape loop: page through the Kunshan rental listings ----
info = []          # accumulated rows: [district, location, price, url]
page = 1           # 1-based page number currently being fetched
TotalNumber = 0    # listings processed so far, across all pages
urlMain = 'https://ks.lianjia.com/zufang/kunshan/'
urlOption = 'rt200600000001l0/'
# 一共有多少条结果,防止找到限制条件以外的推荐结果
# Total matching listings, read from the page header — used as the stop
# condition so we don't drift into the site's "recommended" extras.
Number = int(gethtml(urlMain + urlOption)
             .find(class_='content__article')
             .find(class_='content__title')
             .find('span').text)
print('已找到{}套租房'.format(Number))
# 用while循环去读取每一页的租房信息
while TotalNumber <= Number:
    print('正在读取第%d页' % page)
    # Page 1 has no pgN segment in its URL; later pages do.
    if page == 1:
        url = urlMain + urlOption
    else:
        url = urlMain + 'pg{}'.format(page) + urlOption
    pageSoup = gethtml(url)
    # One card per listing on the results page.
    items = pageSoup.find_all(class_='content__list--item')
    numberOfThisPage = len(items)
    print('该页有%d条租房信息' % numberOfThisPage)
    print('')
    if numberOfThisPage == 0:
        # BUG FIX: an empty page left TotalNumber unchanged and looped forever.
        break
    counter = 0
    # Renamed loop variable — the original's `for Soup in Soup` shadowed the
    # page soup with each item card.
    for item in items:
        counter += 1
        Address = item.find(class_='content__list--item--des').find_all('a')
        if Address == []:
            # Ad/placeholder card with no location links — skip it.
            continue
        Address_DistrictName = Address[2].text
        Address_Location = Address[0].text + ',' + Address[1].text
        Price = item.find('em').text
        Website = item.find(class_='content__list--item--title').find('a')['href']
        # BUG FIX: hrefs are site-relative and these listings live on the
        # Kunshan subdomain — the original prefixed sh.lianjia.com, producing
        # wrong links.
        Website = 'https://ks.lianjia.com' + Website
        info.append([Address_DistrictName, Address_Location, Price, Website])
        if counter == numberOfThisPage:
            break
    # 写入表格中 — rewrite the CSV once per page (the original reopened and
    # rewrote the whole file once per LISTING: quadratic I/O) and pin the
    # encoding so non-ASCII text survives on any platform.
    with open("链家昆山租房信息——全部.csv", "w", encoding="utf-8") as fo:
        for row in info:
            fo.write(",".join(row) + "\n")
    TotalNumber += counter
    page += 1
爬取结果如下(一共八百多条信息):