Web Scraping from Zero: A Beginner's Guide

Environment setup:

  • Python 3
  • PyCharm
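Everything below assumes a Python 3 interpreter. A quick way to confirm which version is running (a minimal sanity check):

import sys

print(sys.version)  # should start with 3.x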

Python Basics

HelloWorld

print('hello,world!')

Data Types

print(123)      # int
print(12.34)    # float
print('A')      # a single character is just a string of length 1
print('中')     # strings are Unicode, so Chinese characters work too
print('hello')  # single quotes
print("hello")  # double quotes work the same way
print('''Dear:
LiHua''')       # triple quotes allow multi-line strings
lst = [1, 2, 3, "hello", "world", 'A', '中']  # lists can mix types freely
print(lst)
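To see which type a value actually has, the built-in type() function can be used:

print(type(123))      # <class 'int'>
print(type(12.34))    # <class 'float'>
print(type('hello'))  # <class 'str'>
print(type([1, 2]))   # <class 'list'>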

Conditionals and Loops

while True:
    a = int(input("Enter a number: "))
    if a == 0:
        print('over')
        break
    elif a == 1:
        print('hello')
    elif a == 2:
        print('world')
    elif a == 3:
        print('python')
    else:
        print('java')
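An elif chain that only maps values to outputs can also be written as a dictionary lookup; dict.get takes a default that plays the role of the else branch. A sketch with the same behavior as the loop above:

replies = {0: 'over', 1: 'hello', 2: 'world', 3: 'python'}
while True:
    a = int(input("Enter a number: "))
    print(replies.get(a, 'java'))  # any other number falls back to 'java'
    if a == 0:
        break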
lst = []
for i in range(20):
    lst.append(i)
print(lst)
lst = [1, 2, 666, 'hello', 'rick']
for item in lst:
    print(item)
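The append loop two examples up is so common that Python has shorthand for it; the same list can be built in one line with a list comprehension:

lst = [i for i in range(20)]          # same result as the append loop
squares = [i * i for i in range(20)]  # the expression can also transform each item
print(lst)
print(squares)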

Building a Simple Crawler

Requests

Install: pip install requests

import requests


if __name__ == '__main__':
    # 1. Specify the URL
    url = 'https://www.sogou.com/'
    # 2. Send the request; get() returns a Response object
    response = requests.get(url=url)
    # 3. Extract the response data; the text attribute holds the HTTP
    #    response body as a string, i.e. the page content at that URL
    page_text = response.text
    print(page_text)
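One caveat: response.text is only meaningful if the request actually succeeded. A quick sanity-check sketch using two standard Response attributes:

import requests

response = requests.get(url='https://www.sogou.com/')
print(response.status_code)  # 200 means the request succeeded
print(response.encoding)     # the charset requests guessed for .text
if response.status_code == 200:
    page_text = response.text
    print(len(page_text))    # size of the page source, as a rough check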

XPath

Install: pip install lxml
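Before pointing XPath at a live site, it may help to see it on a tiny inline document first (a self-contained sketch; the HTML here is made up for illustration):

from lxml import etree

html = '''
<div id="main">
    <ul>
        <li><a href="/a.html">first</a></li>
        <li><a href="/b.html">second</a></li>
    </ul>
</div>
'''

tree = etree.HTML(html)
print(tree.xpath('//li/a/text()'))    # text of every a inside an li: ['first', 'second']
print(tree.xpath('//li/a/@href'))     # href attributes: ['/a.html', '/b.html']
print(tree.xpath('//*[@id="main"]'))  # element with id="main"; xpath() always returns a list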

# -*- coding: utf-8 -*-

import requests
from lxml import etree


if __name__ == '__main__':
    # 1. Specify the URL
    url = 'https://www.sogou.com/'
    # 2. Send the request; get() returns a Response object
    response = requests.get(url=url)
    # 3. Extract the response data (the page source as a string)
    page_text = response.text
    # Instantiate an etree object from the page source
    tree = etree.HTML(page_text)

    # Select an element
    s = tree.xpath('//*[@id="weixinch"]')
    s_text = tree.xpath('//*[@id="weixinch"]/text()')
    print(s)
    print(s_text)

    # Select an attribute
    href = tree.xpath('//*[@id="weixinch"]/@href')
    print(href)

Scraping 4K Wallpapers

Target site: http://pic.netbian.com/4kmeinv/

# -*- coding: utf-8 -*-

import requests
from lxml import etree
import os


if __name__ == '__main__':
    # Create a folder to store the images
    if not os.path.exists('./4k_img'):
        os.mkdir('./4k_img')

    base_url = 'http://pic.netbian.com/4kmeinv/'
    img_base_url = 'http://pic.netbian.com/'  # prefix for the image URLs
    # Spoof the User-Agent so the request looks like a normal browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    for i in range(1, 4):  # scrape the first three pages
        if i == 1:
            add_url = 'index.html'
        else:
            add_url = f'index_{i}.html'
        url = base_url + add_url
        page_text = requests.get(url=url, headers=headers).text
        tree = etree.HTML(page_text)
        # With img_base_url in hand, we still need each image's name and relative URL
        li_list = tree.xpath('//*[@id="main"]//li')  # list of li elements
        for li in li_list:
            # Fix mojibake in Chinese file names: re-encode the mis-decoded
            # text back to bytes, then decode it as GBK
            img_name = li.xpath('.//a/b/text()')[0].encode('ISO-8859-1').decode('gbk') + '.jpg'
            img_add_url = li.xpath('.//img/@src')[0]
            img_url = img_base_url + img_add_url
            # Now we have each image's URL and name
            img_data = requests.get(url=img_url, headers=headers).content  # raw image bytes
            img_path = './4k_img/' + img_name  # final path for this image
            with open(img_path, 'wb') as file:
                file.write(img_data)
            print(img_name + ' downloaded!')
    print('over!')
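An aside on the mojibake fix above: requests guesses the response encoding, and when it guesses wrong, .text is already mis-decoded, hence the encode('ISO-8859-1').decode('gbk') round trip. An arguably cleaner approach (a sketch, assuming the site really serves GBK) is to set the encoding on the Response before reading .text:

import requests

headers = {'User-Agent': 'Mozilla/5.0'}  # shortened UA, for illustration only
response = requests.get(url='http://pic.netbian.com/4kmeinv/', headers=headers)
response.encoding = 'gbk'  # override requests' guess before decoding
page_text = response.text  # now decoded correctly, no re-encode round trip needed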

Scraping Resume Templates

Target site: https://sc.chinaz.com/jianli/free.html

# -*- coding: utf-8 -*-

import requests
import os
from lxml import etree


if __name__ == '__main__':
    # Create a folder to store the resume templates
    if not os.path.exists('./resume_templates'):
        os.mkdir('./resume_templates')

    # Spoof the User-Agent so the request looks like a normal browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'
    }
    base_url1 = 'https://sc.chinaz.com/jianli/'
    base_url2 = 'https:'

    for page in range(1, 4):  # scrape the first three pages
        if page == 1:
            add_url1 = 'free.html'
        else:
            add_url1 = f'free_{page}.html'
        url1 = base_url1 + add_url1  # first-level URL: the listing page
        # Parse the template names and detail-page links out of the listing
        page_text1 = requests.get(url=url1, headers=headers).text
        tree1 = etree.HTML(page_text1)
        p_list = tree1.xpath('//*[@id="container"]//p')
        # Each p element holds one template's name and relative link
        for p in p_list:
            # Fix mojibake: re-encode the mis-decoded text, then decode as UTF-8
            name = p.xpath('./a/text()')[0].encode('ISO-8859-1').decode('utf-8') + '.rar'
            add_url2 = p.xpath('./a/@href')[0]
            url2 = base_url2 + add_url2  # second-level URL: the detail page
            # Parse the download link for the template archive out of the detail page
            page_text2 = requests.get(url=url2, headers=headers).text
            tree2 = etree.HTML(page_text2)
            data_url = tree2.xpath('//*[@id="down"]//li[12]/a/@href')[0]
            data = requests.get(url=data_url, headers=headers).content

            # Save the template
            data_path = './resume_templates/' + name
            with open(data_path, 'wb') as file:
                file.write(data)
            print(name + ' downloaded!')

    print('over!')
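Both scrapers fire requests in a tight loop and assume every request succeeds. A minimal hardening sketch (the timeout, retry count, and delay are assumptions, not requirements of either site):

import time

import requests


def fetch(url, headers, retries=3):
    # Retry a few times on network errors, and time out rather than hang
    for attempt in range(retries):
        try:
            response = requests.get(url=url, headers=headers, timeout=10)
            response.raise_for_status()  # raise on 4xx/5xx status codes
            return response
        except requests.RequestException as e:
            print(f'attempt {attempt + 1} failed: {e}')
            time.sleep(2)  # brief pause before retrying
    return None

Calling fetch(url, headers) in place of the bare requests.get(...) calls, and skipping an item when it returns None, keeps one bad page from crashing the whole run; a short time.sleep between downloads is also a polite default.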