当前位置: 代码迷 >> 综合 >> python3.7,Selenium + webdriver.Chrome 获取腾讯求职信息四
  详细解决方案

python3.7,Selenium + webdriver.Chrome 获取腾讯求职信息四

热度:36   发布时间:2023-12-12 22:19:03.0

代码:

# -*- coding=utf-8 -*-
import os
import threading
import time
import unittest
import xlwt
import xlrd
from xlutils.copy import copy
from selenium import webdriver
from queue import Queue
import warnings
warnings.simplefilter("ignore", ResourceWarning)

# Scrape the Tencent job postings from the first 10 result pages (each page
# lists 10 postings) and write them into an Excel workbook.
#
# Result-page URLs:
#   https://careers.tencent.com/search.html?query=co_1&sc=1
#       (first page; same as ...?query=co_1&index=1&sc=1)
#   https://careers.tencent.com/search.html?query=co_1&index=2&sc=1
#   https://careers.tencent.com/search.html?query=co_1&index=3&sc=1

# Index of the next worksheet row to fill. Starts at 1, i.e. the second row
# (presumably row 0 holds the header of the template -- confirm).
line_begin_nmb = 1

# Everything in the current working directory path that precedes the first
# occurrence of "src" (the whole path when "src" does not occur).
# NOTE: this name shadows the builtin dir(); it is kept because the rest of
# the script reads it under this name.
dir = os.path.abspath('.').split('src')[0]

# formatting_info=True keeps the template's cell formatting so that it is
# carried over into the writable copy. os.path.join is used instead of
# `dir + "./..."` so the path is valid whether or not `dir` ends with a
# path separator.
old_workbook = xlrd.open_workbook(
    os.path.join(dir, "13腾讯求职信息模板.xls"), formatting_info=True)
new_workbook = copy(old_workbook)
new_worksheet = new_workbook.get_sheet(0)

# Worker threads all write into the same worksheet and advance the same row
# counter, so write_xls() is serialized with this lock.
_write_lock = threading.Lock()


def style_font_xingkai(style):
    """Set the font of *style* to 华文行楷 and return the same style object."""
    font = xlwt.Font()
    font.name = u'华文行楷'
    style.font = font
    return style


def style_align_center(style):
    """Give *style* horizontally centered alignment and return it."""
    alignment = xlwt.Alignment()
    alignment.horz = xlwt.Alignment.HORZ_CENTER
    style.alignment = alignment
    return style


def style_align_left(style):
    """Give *style* left alignment with text wrapping enabled and return it."""
    alignment = xlwt.Alignment()
    alignment.horz = xlwt.Alignment.HORZ_LEFT
    alignment.wrap = 1
    style.alignment = alignment
    return style


def write_xls(data):
    """Write one record as the next worksheet row and save the workbook.

    The values of *data* are written left to right in the mapping's iteration
    order. The first four columns are centered; the remaining columns are
    left-aligned with wrapping. After the row is written the module-level row
    counter is advanced and the workbook is saved to "13腾讯求职信息.xls"
    under `dir`.

    The whole operation runs under a lock because it is called from several
    ThreadParse workers that share the worksheet and the row counter.
    """
    global line_begin_nmb
    xls_style = style_font_xingkai(xlwt.XFStyle())
    with _write_lock:
        for column, value in enumerate(data.values()):
            if column < 4:
                xls_style = style_align_center(xls_style)
            else:
                xls_style = style_align_left(xls_style)
            new_worksheet.write(line_begin_nmb, column, value, xls_style)
        line_begin_nmb += 1
        # os.path.join instead of `dir + "./..."`: valid whether or not `dir`
        # ends with a path separator.
        new_workbook.save(os.path.join(dir, "13腾讯求职信息.xls"))


class ThreadParse(threading.Thread):
    """Worker thread that scrapes the 10 postings of one result page.

    Each worker takes one page number from the shared queue, opens that
    result page in its own Chrome instance, visits every posting and hands
    the extracted fields to write_xls().
    """

    # Raw string: the backslashes are literal path separators. The value is
    # identical to the original non-raw literal, which relied on "\P" and "\c"
    # being unrecognized escape sequences.
    chromedriver_path = r"D:\Python_module\chromdriver\chromedriver.exe"

    def __init__(self, thread_name, list_queue):
        """Store the display name and the queue of page numbers to scrape."""
        super(ThreadParse, self).__init__()
        self.thread_name = thread_name
        self.list_queue = list_queue
        self.driver = None

    def run(self):
        """Scrape one page; any error is printed instead of propagated."""
        try:
            time.sleep(5)
            # Non-blocking get: raises if no page number is left. Done before
            # the browser is launched so that no browser is started for nothing.
            page_nmb = self.list_queue.get(False)
            self.driver = webdriver.Chrome(executable_path=self.chromedriver_path)
            try:
                self._crawl_page(page_nmb)
            finally:
                # Always shut the browser down, also when scraping failed;
                # otherwise a Chrome/chromedriver process is left behind.
                self.driver.quit()
            time.sleep(5)
        except Exception as e:
            print(e)

    def _crawl_page(self, page_nmb):
        """Open result page *page_nmb* and record each of its 10 postings."""
        self.driver.maximize_window()
        url = "https://careers.tencent.com/search.html?query=co_1&index=" + str(page_nmb) + "&sc=1"
        print("此时的 URL 是: " + url)
        self.driver.get(url)
        time.sleep(2)
        self.driver.save_screenshot("baidu.png")
        for i in range(10):
            self.driver.find_elements_by_class_name("recruit-title")[i].click()
            time.sleep(2)
            # Assumes the click opened the posting in a second window/tab.
            handle_list = self.driver.window_handles
            self.driver.switch_to.window(handle_list[1])
            job_resume = self._read_job_detail()
            # Close the detail window and return to the result list.
            self.driver.close()
            self.driver.switch_to.window(handle_list[0])
            write_xls(job_resume)
            time.sleep(1)
            # Scroll the result list to an offset of i * 200 pixels.
            js = "let q = document.documentElement.scrollTop={}".format(i * 200)
            self.driver.execute_script(js)
            time.sleep(3)

    def _read_job_detail(self):
        """Print and return the fields of the posting shown in the current window."""
        work_title = self.driver.find_element_by_class_name("work-title").text
        print("工作职称是: " + work_title)
        work_paper = self.driver.find_element_by_class_name("work-wrapper").find_elements_by_tag_name("span")
        # Only the even-indexed <span> elements are read.
        job_type = work_paper[0].text       # type
        place = work_paper[2].text          # location
        job_1 = work_paper[4].text          # job category
        job_2 = work_paper[6].text          # job category 2
        publish_time = work_paper[8].text   # publish time
        for text in (job_type, place, job_1, job_2, publish_time):
            print(text)
        job_content = self.driver.find_elements_by_class_name("work-module")
        responsibility = job_content[0].text    # job responsibilities
        requirement = job_content[1].text       # job requirements
        print(responsibility)
        print(requirement)
        return {
            "title": work_title,
            "type": job_type,
            "place": place,
            "job_1": job_1 + ";" + job_2,
            "publish_time": publish_time,
            "responsibility": responsibility,
            "requirement": requirement,
        }


class Tencent(unittest.TestCase):
    """unittest entry point that drives the scraping threads."""

    def setUp(self):
        """Nothing to prepare."""
        pass

    def testTencent(self):
        """Start one ThreadParse worker per result page (1-10) and wait for all.

        NOTE(review): the original listing lost its indentation; the nesting
        of the sleep calls below is the most plausible reading -- confirm.
        """
        list_queue = Queue(10)
        for i in range(1, 11):
            list_queue.put(i)
        crawl_list = []
        for i in range(1, 11):
            thread_name = "Thread-" + str(i)
            thread = ThreadParse(thread_name, list_queue)
            thread.start()
            # Pause before launching the next worker. Because of this pause
            # the join loop below (and its messages) only starts after all
            # ten workers have been launched.
            time.sleep(67)
            crawl_list.append(thread)
        time.sleep(5)
        # NOTE: get() removes the item, so any page number printed here is
        # discarded and will not be scraped by a worker.
        while not list_queue.empty():
            print("队列还在进行中: " + str(list_queue.get()))
        for thread in crawl_list:
            print("等待线程: " + str(thread.thread_name) + " 的执行")
            thread.join()
        time.sleep(10)

    def tearDown(self):
        """Nothing to clean up."""
        pass


if __name__ == "__main__":
    unittest.main()

效果:

在这里插入图片描述

总结: 刚好 100 条信息(第一行是标题)
在这里插入图片描述

不过,我自己好奇的是,为什么线程等待的消息会出现在最后面:
在这里插入图片描述

  相关解决方案