Scraping Zhihu Images with Python

I've been learning Python recently, so I wrote a simple crawler to get some practice.

Approach

To keep crawling fast, the goal is a multithreaded spider that scrapes images from Zhihu.

The general idea is to simulate a user login, start from the homepage, download all the images on it, and store the links found on the page; then recursively work through the stored links, downloading their images, and repeat.
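Before adding threads, the core of that loop is easiest to see in single-threaded form. The sketch below is only an illustration of the idea, not the final code: the function name, the max_pages guard and the anonymous session are assumptions made for the example, while the actual spider further down logs in first and spreads the work across threads.

import requests
from bs4 import BeautifulSoup
from collections import deque

def simple_crawl(start_url, max_pages=10):
    session = requests.session()            # the real spider logs in first
    seen = {start_url}                      # pages already queued, to avoid loops
    todo = deque([start_url])               # pages waiting to be visited
    while todo and max_pages > 0:
        page = todo.popleft()
        max_pages -= 1
        soup = BeautifulSoup(session.get(page).text, 'html.parser')
        for img in soup.find_all('img'):    # download every image on the page
            src = img.get('src')
            if src and src.startswith('http'):
                print('would download', src)
        for a in soup.find_all('a'):        # remember every unseen absolute link
            href = a.get('href')
            if href and href.startswith('http') and href not in seen:
                seen.add(href)
                todo.append(href)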

Data structures used in the code:

download_image_queue: a queue holding all the links to be processed. Several image-downloading threads compete to pop links from this queue and download the images found on each page.

generate_url_queue: a queue used to discover new links. Besides having its images downloaded, every link popped from download_image_queue is also put into this queue; a single dedicated thread pops links from it, finds all the links on each of those pages, and puts them into download_image_queue (a short sketch of this handoff follows these descriptions).

queried_set: a set holding the links that have already been queried, so the crawler does not fall into an endless loop revisiting the same pages.
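queue.Queue is thread-safe, so many download threads can share download_image_queue without extra locking, while the plain set is only ever touched by the single link-finding thread. Here is a minimal sketch of how the two queues and the set hand work back and forth; the helper and worker names are illustrative and only mirror the real methods shown further down.

import queue
import threading

download_image_queue = queue.Queue()   # shared by many download threads (Queue is thread-safe)
generate_url_queue = queue.Queue()     # consumed by a single link-finding thread
queried_set = set()                    # a plain set is not thread-safe; only that one thread uses it

def enqueue_new_links(links):
    # illustrative helper: dedup against queried_set, then queue pages for download
    for link in links:
        if link not in queried_set:
            queried_set.add(link)
            download_image_queue.put(link)

def download_worker():
    # illustrative stand-in for download_image_work: take a page, queue it
    # for link discovery, then download its images (omitted here)
    while True:
        url = download_image_queue.get()
        generate_url_queue.put(url)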

Code

from bs4 import BeautifulSoup
import uuid
import queue
import threading
import requests
import time
import os


def get_image_name(href):
    # Use the last path segment of the URL as the file name.
    href_list = href.split('/')
    return href_list.pop()


def rename_image(name):
    # Fall back to a random .jpg name when the extension is not a known image suffix.
    name_list = name.split('.')
    if name_list.pop().lower() not in image_suffix():
        name = str(uuid.uuid4()) + '.jpg'
    return name


def image_suffix():
    image_suffixes = ['jpg', 'jpeg', 'png', 'gif']
    return image_suffixes


def create_dir(download_dir):
    if not os.path.exists(download_dir):
        os.mkdir(download_dir)


class LearnSpider:
    def __init__(self, username, password):
        self.queried_set = set()                  # links that have already been queried
        self.generate_url_queue = queue.Queue()   # pages waiting to be scanned for new links
        self.download_image_queue = queue.Queue() # pages waiting to have their images downloaded
        self.session = self.login_zhihu(username, password)

    @staticmethod
    def login_zhihu(username, password):
        data = {
            "email": username,
            "password": password,
            "remember_me": "true",
        }
        login_url = "https://www.zhihu.com/login/email"
        session = requests.session()
        response = session.post(url=login_url, data=data)
        print(response.json())
        return session

    def parse_html(self, path):
        html = self.session.get(path).text
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    def generate_path(self, path):
        # Collect every absolute link on the page that has not been visited yet.
        soup = self.parse_html(path)
        link_list = soup.find_all("a")
        for link in link_list:
            href_value = link.get('href')
            if href_value is not None and href_value.startswith('http'):
                if href_value not in self.queried_set:
                    self.queried_set.add(href_value)
                    self.download_image_queue.put(href_value)

    def download_images(self, path, download_dir):
        # Hand the page to the link-finding thread, then download its images.
        self.generate_url_queue.put(path)
        soup = self.parse_html(path)
        img_list = soup.find_all('img')
        for link in img_list:
            src_value = link.get('src')
            if src_value is not None and src_value.startswith('http'):
                print(threading.current_thread().name, " downloading ", src_value)
                name = rename_image(get_image_name(src_value))
                with open("{0}{1}{2}".format(download_dir, os.path.sep, name), 'wb') as outfile:
                    data = self.session.get(src_value).content
                    outfile.write(data)

    def generate_path_work(self):
        while True:
            if not self.generate_url_queue.empty():
                pop_path = self.generate_url_queue.get()
                self.generate_path(pop_path)
            else:
                time.sleep(1)
                print("generate queue is null")

    def download_image_work(self, download_path):
        while True:
            if not self.download_image_queue.empty():
                pop_path = self.download_image_queue.get()
                self.download_images(pop_path, download_path)
            else:
                time.sleep(1)
                print("download queue is null")

    def start_download(self, download_dir, download_thread_count=10):
        create_dir(download_dir)
        path = "https://www.zhihu.com"
        self.download_image_queue.put(path)
        generate_thread = threading.Thread(target=self.generate_path_work, name="generate_path_thread")
        for i in range(1, download_thread_count + 1):
            threading.Thread(target=self.download_image_work, args=(download_dir,), name="thread{0}".format(i)).start()
        generate_thread.start()


if __name__ == "__main__":
    login_username = "***"
    login_password = "***"
    spider = LearnSpider(login_username, login_password)
    spider.start_download("e:\\down_images", download_thread_count=50)
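One note on the worker loops above: they poll the queues with empty() and sleep for a second when nothing is there. Since queue.Queue.get() blocks until an item arrives, the same loop could also be written without polling. This is just an alternative sketch, not part of the original code:

    def download_image_work(self, download_path):
        while True:
            pop_path = self.download_image_queue.get()   # blocks until a link is available
            self.download_images(pop_path, download_path)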