本文共 2211 字,大约阅读时间需要 7 分钟。
# -*- coding: UTF-8 -*-
"""Bing image crawler.

Downloads up to N images per keyword via the cn.bing.com async image-search
endpoint, saving each keyword's images into its own directory.
"""
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import requests
import time
import json
import sys
import re
import os

# Target URL template: %s = query keyword, then first-index, count, relp.
CRAWL_TARGET_URL = 'https://cn.bing.com/images/async?q=%s&first=%d&count=%d&relp=%d&lostate=r&mmasync=1'
# Images fetched per request (35 is this page's per-scroll batch size).
NUMS_PER_CRAWL = 35
# Minimum image size in bytes; smaller payloads are discarded.
MIN_IMAGE_SIZE = 10

# Pre-compiled ".ext" suffix matcher, hoisted out of the per-image hot path.
_EXT_PATTERN = re.compile(r"^\.[a-zA-Z]+")


def get_image(url, path, count):
    """Download one image from *url* and save it as <path>/<count><ext>.

    Returns:
        0  on success,
        -1 if the payload is too small or the URL has no usable extension,
        -2 on download failure,
        -3 on write failure.
    """
    try:
        u = urllib.request.urlopen(url, timeout=5)
        t = u.read()
        # BUGFIX: the original used sys.getsizeof(t), which includes the
        # bytes-object header (~33 bytes) and so never rejected tiny payloads
        # against a 10-byte threshold. len(t) is the actual byte count.
        if len(t) < MIN_IMAGE_SIZE:
            return -1
    except Exception as e:
        # Best-effort crawler: log and skip this image.
        print(url, e)
        return -2
    # Extract the image extension from the URL tail.
    m = _EXT_PATTERN.match(url[url.rfind('.'):])
    if m is None:
        # BUGFIX: the original crashed with AttributeError on URLs without a
        # plain ".ext" suffix; treat such images as discardable instead.
        return -1
    frmt = m.group(0)
    target = os.path.join(path, str(count) + frmt)
    try:
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists + os.mkdir pair.
        os.makedirs(path, exist_ok=True)
        with open(target, 'wb') as f:
            f.write(t)
    except Exception as e:
        print(target, e)
        return -3
    return 0


def crawl_data(info, path, num):
    """Crawl up to *num* images for keyword *info* into directory *path*."""
    first = 0
    count = 0
    # One session reuses the HTTP connection across paginated requests.
    s = requests.Session()
    while count < num:
        # quote() makes the keyword URL-safe (Chinese keywords, spaces, &, ...).
        u = CRAWL_TARGET_URL % (urllib.parse.quote(info), first,
                                NUMS_PER_CRAWL, NUMS_PER_CRAWL)
        # 3.05s connect timeout, 10s read timeout.
        req = s.get(url=u, timeout=(3.05, 10))
        bf = BeautifulSoup(req.text, "html.parser")
        imgtags = bf.find_all("a", class_="iusc")
        if not imgtags:
            # BUGFIX: when Bing returns no more results the original spun
            # forever (count never advanced). Stop paginating instead.
            break
        for e in imgtags:
            if count == num:
                return
            # Each result anchor carries a JSON blob in its "m" attribute;
            # "murl" is the full-resolution image URL.
            urldict = json.loads(e.get('m'))
            if get_image(urldict["murl"], path, count) < 0:
                continue
            print("第%d张图片下载完成,总进度%d%%" % (count + 1, (count + 1) * 100 / num))
            sys.stdout.flush()
            count = count + 1
            time.sleep(0.01)  # be gentle with the image hosts
        first = first + NUMS_PER_CRAWL
        time.sleep(0.1)  # throttle between result pages


if __name__ == '__main__':
    tstart = time.time()
    key_words = ['行李', '衣服']
    for word in key_words:
        # One subdirectory per keyword, created on demand.
        if os.path.exists('./' + word) == False:
            os.makedirs('./' + word)
        path = './' + word + '/'
        picture_num = 1000
        crawl_data(word, path, picture_num)
    print("所有图片下载完毕,总用时%.2fs" % (time.time() - tstart))
此代码为网上所找,不记得网址了。原作者看见请告知。
转载地址:http://vuhws.baihongyu.com/