# -*- coding: utf-8 -*-
# Scrape free resume templates (简历模板) from sc.chinaz.com.
"""Download free resume templates from sc.chinaz.com.

Walks the first three listing pages, follows each template's detail
page, downloads the archive from the 12th mirror link, and saves it
under ./简历模板/ with its original (Chinese) title.
"""
import os

# Directory the downloaded .rar archives are written into.
SAVE_DIR = './简历模板'

# Listing index base URL; detail-page hrefs are scheme-relative ("//...").
LIST_BASE = 'https://sc.chinaz.com/jianli/'
SCHEME = 'https:'

# A well-formed browser User-Agent.  The original value was mangled
# (spaces inserted mid-token and the whole string duplicated with a
# literal "User - Agent:" fused inside it), which many servers reject.
HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/86.0.4240.75 Safari/537.36'),
}


def list_page_url(page):
    """Return the URL of the 1-based *page*-th listing page.

    Page 1 is 'free.html'; later pages are 'free_<n>.html'.
    """
    suffix = 'free.html' if page == 1 else f'free_{page}.html'
    return LIST_BASE + suffix


def download_all(pages=3):
    """Download every template on the first *pages* listing pages.

    Side effects: creates SAVE_DIR if needed, writes one .rar file per
    template, and prints a progress line per download plus 'over!' at
    the end.  Network errors propagate as requests exceptions.
    """
    # Imported lazily so the module can be imported (e.g. for the pure
    # URL helper) without the third-party scraping stack installed.
    import requests
    from lxml import etree

    # exist_ok avoids the check-then-create race of exists()+mkdir().
    os.makedirs(SAVE_DIR, exist_ok=True)

    for page in range(1, pages + 1):
        listing = requests.get(url=list_page_url(page), headers=HEADERS).text
        tree = etree.HTML(listing)
        for p in tree.xpath('//*[@id="container"]//p'):
            # requests decodes the listing as ISO-8859-1 by default;
            # re-encode to recover the original UTF-8 Chinese title.
            title = p.xpath('./a/text()')[0].encode('ISO-8859-1').decode('utf-8')
            name = title + '.rar'

            detail_url = SCHEME + p.xpath('./a/@href')[0]
            detail = etree.HTML(
                requests.get(url=detail_url, headers=HEADERS).text)

            # 12th mirror on the download list — fragile: assumes the
            # page always offers at least 12 mirror links.
            data_url = detail.xpath('//*[@id="down"]//li[12]/a/@href')[0]
            data = requests.get(url=data_url, headers=HEADERS).content

            with open(os.path.join(SAVE_DIR, name), 'wb') as fh:
                fh.write(data)
            print(name + '下载成功!')
    print('over!')


if __name__ == '__main__':
    download_all()