2021-03-06
python3如何获取在线更新代理IP?
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Fetch a live list of free proxy IPs and send requests through them.

Most websites limit how often a single IP address may access them, so a
crawler usually has to go through proxies.  This example scrapes a free
proxy listing page for "ip:port" entries and then issues requests through
randomly chosen proxies, dropping the ones that fail to connect.  Try it
out to obtain proxy IPs that are refreshed in real time.
"""
import re
import sys
import time
import datetime
import threading
from random import choice

import requests
import bs4

# The cell patterns were found by inspecting the listing page with the
# browser's developer tools.
# IP cell, e.g. <td>208.135.217.21</td>
_IP_CELL_RE = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')
# Port cell, e.g. <td>808</td>
_PORT_CELL_RE = re.compile(r'<td>(\d+)</td>')

ROUNDS = 500                 # number of worker threads started by main()
REFRESH_EVERY = 1000         # re-scrape the listing every this many rounds
SPAWN_INTERVAL_SECONDS = 2   # pause between two worker threads


class Proxy:
    """Scrape proxies from a listing page and send requests through them.

    Args:
        url: Proxy listing page to scrape.
        header: Mapping of HTTP headers sent when scraping the listing.
        user_agent: Sequence of User-Agent strings (one is picked at random
            per request) or a single User-Agent string.
        target_url: Page that ``done`` requests through a proxy.
        timeout: Timeout in seconds applied to every HTTP request.
    """

    def __init__(self, url='http://www.xicidaili.com/nn', header='',
                 user_agent='', target_url='', timeout=10):
        self.url = url
        self.header = header
        self.user_agent = user_agent
        self.target_url = target_url
        self.timeout = timeout
        # The proxy list handed to done() is shared between worker threads.
        self._lock = threading.Lock()

    def getIpList(self):
        """Return the proxies on the current listing page as "ip:port" strings.

        Returns an empty list when the page contains no ``<table>``.
        """
        # An empty ``header`` (the historical default is '') means "no extra
        # headers".
        r = requests.get(self.url, headers=self.header or None,
                         timeout=self.timeout)
        soup = bs4.BeautifulSoup(r.text, 'html.parser')
        if soup.table is None:
            return []
        data = str(soup.table.find_all("td"))
        ip = _IP_CELL_RE.findall(data)
        port = _PORT_CELL_RE.findall(data)
        # Combine IP and port, e.g. 125.135.217.7:808
        return [":".join(pair) for pair in zip(ip, port)]

    def _pick_user_agent(self):
        """Return one User-Agent string, or '' when none is configured."""
        agents = self.user_agent
        if isinstance(agents, str):
            return agents
        return choice(agents) if agents else ''

    def done(self, code=0, ips=None):
        """Request ``target_url`` through a random proxy taken from ``ips``.

        Proxies that fail to connect are removed from ``ips`` (in place) and
        another one is tried until a request succeeds or the list is empty.

        Args:
            code: Sequence number of this attempt; only used in the log line.
            ips: Mutable list of "ip:port" strings, possibly shared with
                other threads.

        Returns:
            True when a request succeeded, False when no proxy is left.

        Raises:
            ValueError: if ``target_url`` is empty.
        """
        if not self.target_url:
            raise ValueError("target_url is not set")
        if ips is None:
            ips = []

        while True:
            # Pick a proxy at random.
            with self._lock:
                ip = choice(ips) if ips else None
            if ip is None:
                print("not ip")
                return False

            headers_ = {
                "Accept": "*/*",
                "Accept-Encoding": "gzip, deflate, sdch",
                "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
                "Referer": "https://best.zhaopin.com/",
            }
            agent = self._pick_user_agent()
            if agent:
                headers_["User-Agent"] = agent

            try:
                requests.get(self.target_url, headers=headers_,
                             proxies={"http": ip}, verify=False,
                             timeout=self.timeout)
            except (requests.exceptions.ConnectionError,
                    requests.exceptions.Timeout):
                print("Connection Error")
                # Drop the unusable proxy; another thread may already have
                # removed it, hence the membership check under the lock.
                with self._lock:
                    if ip in ips:
                        ips.remove(ip)
                continue

            date = datetime.datetime.now().strftime('%H:%M:%S')
            print(u"第%s次 [%s] [%s]: (剩余可用代理IP数:%s)" % (code, date, ip, len(ips)))
            return True


def main():
    """Start one worker thread every SPAWN_INTERVAL_SECONDS seconds."""
    url = 'http://www.xicidaili.com/nn'
    # Page to request through the proxies.  The original example left this
    # empty for the reader to fill in.
    target_url = ''
    user_agent = [
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0; Baiduspider-ads) Gecko/17.0 Firefox/17.0",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9b4) Gecko/2008030317 Firefox/3.0b4",
        "Mozilla/5.0 (Windows; U; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; BIDUBrowser 7.6)",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; Win64; x64; Trident/7.0; Touch; LCJB; rv:11.0) like Gecko",
    ]
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;",
        "Accept-Encoding": "gzip, deflate, sdch",
        "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6",
        "Referer": "http://www.xicidaili.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36",
    }

    if not target_url:
        sys.exit("Set target_url in main() to the page you want to request.")

    proxy = Proxy(url, headers, user_agent, target_url=target_url)
    ips = []
    # Python 3 merged xrange() into range().
    for i in range(ROUNDS):
        # Re-fetch the latest proxies every REFRESH_EVERY rounds.
        # NOTE(review): REFRESH_EVERY is larger than ROUNDS, so the listing
        # is only fetched once, at i == 0 - confirm the intended interval.
        if i % REFRESH_EVERY == 0:
            ips.extend(proxy.getIpList())
        # Start a new worker thread every SPAWN_INTERVAL_SECONDS seconds.
        threading.Thread(target=proxy.done, args=(i, ips)).start()
        # time.sleep() takes seconds (fractions are allowed).
        time.sleep(SPAWN_INTERVAL_SECONDS)


if __name__ == '__main__':
    main()

# Run this in Python, pick the proxy listing you want to scrape, and you can
# fetch proxy IPs in real time.
2021-03-06
代理服务器ip如何爬取指定网站?
"""Use previously scraped proxy IPs to scrape further proxy IPs.

This script builds on the previous proxy-IP crawler.  Once relatively
long-lived IP addresses scraped from a domestic high-anonymity proxy site
have been saved to IP.txt, those addresses can themselves be used as proxies
to crawl the listing site for the IPs we need.
"""
from bs4 import BeautifulSoup
import re
import time
import requests
import random
from fake_useragent import UserAgent

# (pause before the attempt, request timeout), both in seconds.
_ATTEMPTS = ((1, 20), (10, 40), (15, 60))
# Message printed after the first and the second failed attempt.
_RETRY_MESSAGES = ("重新运行", "第二次重新运行")
# Only keep proxies whose listed lifetime exceeds this many days.
_MIN_DAYS_ALIVE = 10


def get_ip_list():
    """Return the non-blank lines of IP.txt, stripped of whitespace."""
    with open('IP.txt', 'r') as f:
        return [line.strip() for line in f if line.strip()]


def get_random_ip(ip_list):
    """Return a requests ``proxies`` mapping for one random entry of *ip_list*.

    The proxy is registered for both schemes: requests selects the proxy by
    the scheme of the requested URL, so an 'https'-only mapping is ignored
    for http:// URLs.

    Raises:
        IndexError: if *ip_list* is empty.
    """
    proxy_ip = random.choice(ip_list).strip()
    return {'http': proxy_ip, 'https': proxy_ip}


def _fetch_via_proxy(url, ip_list):
    """Return the body of *url*, trying up to three random proxies.

    The last request error is re-raised when every attempt fails.
    """
    user_agents = UserAgent()
    last_attempt = len(_ATTEMPTS) - 1
    for attempt, (pause, timeout) in enumerate(_ATTEMPTS):
        time.sleep(pause)
        proxies = get_random_ip(ip_list)
        headers = {'User-Agent': str(user_agents.random)}
        print(proxies)
        try:
            req = requests.get(url=url, proxies=proxies, headers=headers,
                               timeout=timeout)
        except requests.exceptions.RequestException:
            if attempt == last_attempt:
                raise
            print(_RETRY_MESSAGES[attempt])
            continue
        print("requests请求成功")
        req.encoding = 'utf-8'
        return req.text


def _extract_fast_long_lived(html):
    """Return "ip:port" for table rows marked fast and alive long enough.

    A row qualifies when cell 7 contains a ``div`` with class
    ``bar_inner fast`` and cell 8 reads ``<n>天`` with n > _MIN_DAYS_ALIVE.
    NOTE(review): per the article, cell 7 is the connection-time bar and
    cell 8 the survival time - confirm against the live page layout.
    """
    soup = BeautifulSoup(html, 'lxml')
    selected = []
    # The first <tr> is skipped (header row).
    for row in soup.find_all('tr')[1:]:
        tds = row.find_all('td')
        if len(tds) < 9:
            continue
        if tds[7].find('div', class_='bar_inner fast') is None:
            continue
        alive = tds[8].text.strip()
        if not alive.endswith('天'):
            continue
        try:
            days = int(alive[:-1])
        except ValueError:
            continue
        if days > _MIN_DAYS_ALIVE:
            selected.append(tds[1].text + ':' + tds[2].text)
    return selected


def get_content(url, ip_list):
    """Fetch *url* through a random proxy and return the qualifying proxies.

    Args:
        url: Listing page to scrape.
        ip_list: Proxy addresses to route the request through.

    Returns:
        List of "ip:port" strings for rows that are marked fast and have
        been alive for more than _MIN_DAYS_ALIVE days.
    """
    print("get_content函数执行!")
    return _extract_fast_long_lived(_fetch_via_proxy(url, ip_list))


def main():
    """Scrape the listing pages and append the results to NewFile.txt."""
    ip_list = get_ip_list()
    with open('NewFile.txt', 'a') as out:
        for page in range(1, 2):
            url = 'http://www.xicidaili.com/wt/{}'.format(page)
            for ip in get_content(url, ip_list):
                out.write('http://' + ip + '\n')


if __name__ == '__main__':
    main()

# (A screenshot of qualifying IP addresses belonged here.)  An address
# qualifies only when both conditions hold.  How can we tell that the
# connection time is "green"?  By inspecting the page elements: every green
# bar has the class `bar_inner fast` (another screenshot belonged here).
# The orange and yellow bars use the medium and slow variants of that class
# (the article spells them `bar_inermedium` / `bar_inerslow`; presumably
# `bar_inner medium` / `bar_inner slow` - verify on the page), so the class
# value tells whether an address is one we want.  If you want to crawl a
# particular site through proxy IPs, give the code above a try - hope it
# helps!
2021-03-06
如何用python爬虫代理ip爬取网页数据?
# In the age of online marketing many older approaches no longer fit the
# Internet era and often fail to deliver results.  Running online marketing
# well takes a range of tools and care at every step.  Just as with Q&A
# promotion, proxy-IP support is indispensable: find the most effective tools
# and raise efficiency to get the most out of online marketing.  The code for
# crawling web page data with Python follows.
'''Python 3.x
Description: this demo shows how a crawler requests a web page through
(dynamic) proxy IPs, using multiple threads.
Logic: every 5 seconds fetch IPs from the API endpoint and start one thread
per IP to download the page source.
'''
import requests
import time
import threading
from requests.packages import urllib3

ips = []

# Configuration lives at module level so the thread classes also work when
# this file is imported rather than run as a script.
# API endpoint that returns the proxy IPs, one per line (placeholder).
apiUrl = "http:xxxx"
# Target page to download.
targetUrl = "http://ip.chinaz.com/getip.aspx"


class CrawlThread(threading.Thread):
    """Thread that downloads ``targetUrl`` through a single proxy."""

    def __init__(self, proxyip):
        super().__init__()
        # Strip stray whitespace such as '\r' left over from the API response.
        self.proxyip = proxyip.strip()

    def run(self):
        """Fetch the page, then print the elapsed time and the HTML."""
        start = time.perf_counter()
        # Silence the warning caused by disabling certificate verification.
        urllib3.disable_warnings()
        # NOTE(review): an 'https://' proxy URL makes the client speak TLS to
        # the proxy itself; plain HTTP proxies normally need 'http://' for
        # both entries - confirm which kind the API hands out.
        proxies = {
            "http": 'http://' + self.proxyip,
            "https": 'https://' + self.proxyip,
        }
        try:
            # verify=False skips SSL verification (avoids SSL errors).
            response = requests.get(url=targetUrl, proxies=proxies,
                                    verify=False, timeout=15)
        except requests.exceptions.RequestException as exc:
            print(self.name + " request via proxy " + self.proxyip
                  + " failed: " + str(exc))
            return
        # Pages are not guaranteed to be valid UTF-8.
        html = response.content.decode(errors='replace')
        # The output is labelled in milliseconds, so convert from seconds.
        elapsed_ms = (time.perf_counter() - start) * 1000
        print(self.name + "使用代理IP, 耗时 " + str(elapsed_ms) + "毫秒 " + self.proxyip + " 获取到如下HTML内容:\n" + html + "\n*************")


class GetIpThread(threading.Thread):
    """Thread that polls ``apiUrl`` and starts a CrawlThread per proxy."""

    def __init__(self, fetchSecond):
        super().__init__()
        # Seconds to wait between two polls of the API.
        self.fetchSecond = fetchSecond

    def run(self):
        """Poll the API forever; a failed poll is reported and retried."""
        global ips
        while True:
            try:
                res = requests.get(apiUrl, timeout=15).content.decode(errors='replace')
            except requests.exceptions.RequestException as exc:
                print("failed to fetch proxy list: " + str(exc))
            else:
                # One proxy per line; ignore blank lines.
                ips = [line.strip() for line in res.splitlines() if line.strip()]
                for proxyip in ips:
                    CrawlThread(proxyip).start()
            time.sleep(self.fetchSecond)


if __name__ == '__main__':
    # Interval between two API polls; 5 seconds is recommended.
    fetchSecond = 5
    # Start fetching proxy IPs.
    GetIpThread(fetchSecond).start()

# This article showed how to crawl web page data through proxy IPs with a
# Python crawler.  Browse on to learn more!