应爬取新数据的急迫需求,学习和整理如下爬虫程序供后续学习和使用!修改各个路径和URL后本代码就可以正常执行了。
主要步骤:
1.从西刺网上爬取代理IP数据;
2.检验爬取的IP的有效性;
3.将有效IP封装在List数据结构中构造成一个IP池,每次爬取数据时,随机从IP池中选取一个IP作为代理使用,防止本机IP被目标网站的反爬虫机制封禁!
4.传入有效的目标网站的URL即可访问数据。
#爬取可用公网IP构建IP池,每次使用后随机切换IP反爬虫:现用现爬取,爬取新的然后去使用掉.#python2.7/3.x下open()形式均可以使用:针对西刺网from bs4 import BeautifulSoupfrom urllib import requestimport randomimport urllibimport requestsimport timeimport urllib.request import urllib.parse import time from multiprocessing import Pool#多进程import randomfrom lxml import etree #解析import pandas as pddef IP_pool(path): s = requests.session() header = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36" } ip_list_all = [] for i in range(1,20): #观察到网页的URL最后一位表示网页的页号,实现翻页爬取功能,好像20页可以正常爬取,再多就不行了。 rs = s.get(url="http://www.xicidaili.com/nn/"+str(i), headers=header) #封装成浏览器去访问 soup = BeautifulSoup(rs.text, "lxml") ip_list = soup.select_one("#ip_list").select("tr") ip_info_list_key = ["ip", "port", "address", "hidden", "type", "speed", "conn_time", "survival_time", "verify_time"] for item in ip_list[1:]: ip_info_list_value = [] ip_info = item.select("td") for info in ip_info[1:]: if info.select_one(".bar"): ip_info_list_value.append(info.select_one(".bar")["title"]) else: ip_info_list_value.append(info.get_text().strip()) ip_list_all.append(dict(zip(ip_info_list_key, ip_info_list_value))) print(len(ip_list_all)) # 将爬取到的IP都写text文件 with open(path,'w+') as ws: ws.write("0") ws.write('t') ws.write("序号") # 在1行1列写入 ws.write('t') ws.write("IP地址") ws.write('t') ws.write("端口") ws.write('t') ws.write("服务器地址") ws.write('t') ws.write("是否匿名") ws.write('t') ws.write("类型") ws.write('t') ws.write("速度") ws.write('t') ws.write("连接时间") ws.write('t') ws.write("存活时间") ws.write('t') ws.write("验证时间") ws.write('') i = 0 for item in ip_list_all: i += 1 ws.write(str(i)) # 在i+1行1列写入 ws.write('t') ws.write(item["ip"]) ws.write('t') ws.write(item["port"]) ws.write('t') ws.write(item["address"]) ws.write('t') ws.write(item["hidden"]) ws.write('t') ws.write(item["type"]) ws.write('t') ws.write(item["speed"]) ws.write('t') ws.write(item["conn_time"]) ws.write('t') 
ws.write(item["survival_time"]) ws.write('t') ws.write(item["verify_time"]) ws.write('') print("写text文本完成!!!") #爬取了1900行IP_path='E:/西刺免费代理IP.txt'#IP_pool(IP_path) #爬取有效的公共IP存入list作为公共的IP池,需要新的IP池就去掉注释即可。def Jiexi_IPtext(IP_path): ipfile=open(IP_path,'r') content=ipfile.readlines() ipfile.close() #str(content[:3]) #查看文本的NLP的CT切片='/t'来分隔的 #1.抽取出来所有爬取的Proxy_header Proxy_header=[['http_https类型','IP地址:端口']] import re for line in content: zhou=line.strip().split('t') shou=re.search('[0-9]*',zhou[8]).group(0) #找到字符串中的天数 #判断一下验证天数,选取大于10天的 if '天' in zhou[8] and int(shou)10: Proxy_header.append([zhou[5],zhou[1]+':'+zhou[2]]) del Proxy_header[:2] #删除前面两个不符合的数据 return Proxy_headerProxy_header=Jiexi_IPtext(IP_path)#2.准备好需要爬取的基因ID编号的List列表genID_path='E:/gene_4.0.txt'def return_GeneId(genID_path): with open(genID_path,'r') as fp: content=fp.readlines() geneID_List=[] for line in content: zhou=[] zhou=line.strip().split('t') if 'D' in zhou[-1]: #只筛选出来含有'D'开头的药物ID号 geneID_List.append(zhou[-1]) return geneID_ListgeneID_List=return_GeneId(genID_path)#li=list(set(geneID_List)) #7773def GetUserAgent(): ''' 功能:随机获取HTTP_User_Agent ''' user_agents=[ "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; 
.NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; 
QQBrowser/7.0.3698.400)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10" ] user_agent = random.choice(user_agents) return user_agentdef testProxy(Proxy_header): ''' 功能:验证IP有效性 @curr_ip:当前被验证的IP ''' head1=Proxy_header[0].lower() curr_ip=Proxy_header[1] tmp_proxies = [] tarURL='https://www.baidu.com' user_agent = GetUserAgent() proxy_support = urllib.request.ProxyHandler({head1:curr_ip}) opener = urllib.request.build_opener(proxy_support) opener.addheaders=[("User-Agent",user_agent)] urllib.request.install_opener(opener) try: res = urllib.request.urlopen(tarURL,timeout=5).read() if len(res)!=0: tmp_proxies.append(curr_ip) except urllib.error.URLError as er2: pass # if hasattr(er2,"code"): # print("验证代理IP("+curr_ip+")时发生错误(错误代码):"+str(er2.code)) # if hasattr(er2,"reason"): # 
print("验证代理IP("+curr_ip+")时发生错误(错误原因):"+str(er2.reason)) except Exception as er: pass # print("验证代理IP("+curr_ip+")时发生如下错误):") # print(er) #time.sleep(2) return tmp_proxies#可用IP列表valid_IP=[]for i in range(len(Proxy_header)): tmp_proxies=testProxy(Proxy_header[i]) if len(tmp_proxies)!=0: valid_IP.append(Proxy_header[i])import pandas as pddf=pd.DataFrame(valid_IP,columns=['type','IP'])df.to_csv('E:/valid_IP.csv',index=False,header=True)df=pd.read_csv('E:/valid_IP.csv')valid_IP1=[]for i in range(len(df)): zhou=df.loc[i,:].tolist() #取出DataFrame中的一行数据 valid_IP1.append(zhou)def scrapy_web(savepath,Proxy_header,geneID_List): import requests domain='http://rest.genome.jp/subcomp/' header={"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Accept-Encoding": "gb2312, utf-8", "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:64.0) Gecko/20100101 Firefox/64.0", "Connection": "keep-alive", "referer":"rest.genome"} #使用"查看元素网络http状态头部为200的消息头原始头" k = 0 #IP爬死掉一个再换,IP池中的IP还能用多少次未知,随机选择不靠谱。用完一个算一个。HTTP的可以:1,22,13,新版本的2 for i in range(4214,len(geneID_List)): # 1675开始, 目前爬取到了2991,爬虫死了这样来重新继续爬取任务 out_path = savepath + geneID_List[i] + '.txt' with open(out_path, 'w') as fp: #给定文件路径后open会自动创建文件 try: #if k % 500 == 0: #IP_item = Proxy_header[k] # 从List中逐次选出来一个数字 IP_item = random.choice(Proxy_header) #从list中随机选择一个数据 #IP_item=['https','59.32.37.67:8010'] # if k==0: # pass # else: # time.sleep(60) # 延时从2s增加到5s显著的减少了IP的浪费 new_url=domain+geneID_List[i]+'/combound/cutoff=0.01/limit=4000/mode=all/charge=on/coordinate=on/valence=on/chiral=on' proxies={IP_item[0].lower(): IP_item[1]} res=requests.get(new_url,proxies=proxies, headers=header,timeout=12) #封装了浏览器头部,同时使用IP池技术做一定程度的反爬虫,timeout设置长一些可以容错:请求时间较长的数据可以爬取下来。 res.encoding='utf-8' #设置页面的编码方式 fp.write(res.text) #k += 1 print('第%d爬取成功!!!' 
% i) print('有效的IP: ',IP_item[0].lower(),IP_item[1]) except: k=0 while len(res.text)==0: #IP_item = Proxy_header[k] IP_item = random.choice(Proxy_header) #IP_item = ['https', '59.32.37.67:8010'] new_url = domain + geneID_List[i] + '/combound/cutoff=0.01/limit=4000/mode=all/charge=on/coordinate=on/valence=on/chiral=on' proxies = {IP_item[0].lower(): IP_item[1]} res = requests.get(new_url, proxies=proxies, headers=header, timeout=12) # 封装了浏览器头部,同时使用IP池技术做一定程度的反爬虫 res.encoding = 'utf-8' # 设置页面的编码方式 fp.write(res.text) time.sleep(15) print('#######第%d没有爬取成功####!!!' % i) k+=1 if k==10: break #循环10次都找不到,说明是geneID_List[i]不存在,跳出循环 continue print('===========================================') print('我爬完了!!!')if __name__ == '__main__': save_path='D:/Webscrapy/shou/' del Valid_IP[0] #删除第一个是重定向到有道 scrapy_web(save_path,Valid_IP1,geneID_List)附:
1.爬取的西刺网IP组织效果:
2.验证后的有效数据组织效果:
3.爬取的数据需要用于论文实验不做展示。













