# Source: 掘金 后端 (Juejin backend column) • 2024-03-30 23:39
from lxml import etree
import requests,random

class ip_proxy:
    """Scrape free HTTP proxies from www.ip3366.net and hand out a working one.

    NOTE: ``arr`` is a class-level cache shared by every caller; repeated
    calls to ``get_ip_list()`` keep appending to the same list.
    """

    arr = []  # cache of validated proxies, each as 'http://ip:port'
    headers = {
        # BUG FIX: the key was misspelled 'User-Agen', so no User-Agent
        # header was ever actually sent.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.0.0'
    }

    # Check whether a proxy is usable
    @classmethod
    def check_valid(cls, ip_port):
        """Return True if ``ip_port`` ('http://ip:port') can proxy a GET to Baidu.

        The timeout and exception guard keep one dead proxy from hanging
        or crashing the whole scrape (the original call had neither).
        """
        try:
            response = requests.get(
                url='https://www.baidu.com/',
                headers=cls.headers,
                proxies={'http': ip_port},
                timeout=5,
            )
            return response.status_code == 200
        except requests.RequestException:
            # Unreachable/slow proxy: treat as invalid rather than propagate.
            return False

    # Scrape the proxy list
    @classmethod
    def get_ip_list(cls):
        """Scrape pages 1-5 of ip3366.net, validate each row, return the cache.

        Returns:
            list[str]: the shared class-level cache ``cls.arr``
            (appended to in place).
        """
        for page in range(1, 6):
            response = requests.get(
                url=f'http://www.ip3366.net/?stype=1&page={page}',
                headers=cls.headers,
                timeout=10,
            )
            html_tree = etree.HTML(response.text)
            if html_tree is None:
                # Page did not parse (empty/garbled response); skip it.
                continue
            tr_tags = html_tree.xpath('//div[@id="list"]/table/tbody/tr')

            # Extract ip/port from each table row
            for tr in tr_tags:
                td_tags = tr.xpath('./td')
                if len(td_tags) < 2 or td_tags[0].text is None or td_tags[1].text is None:
                    # Malformed row (header/ad rows etc.); ignore it.
                    continue
                ip = td_tags[0].text.strip()
                port = td_tags[1].text.strip()
                ip_port = 'http://' + ip + ':' + port
                if cls.check_valid(ip_port):
                    cls.arr.append(ip_port)
        return cls.arr

    # Pick one working proxy
    @classmethod
    def get_proxy(cls):
        """Return a requests-style proxies dict ``{'http': 'http://ip:port'}``.

        Candidates are re-validated in random order.  The original
        implementation looped forever when no proxy was valid and crashed
        with ``ValueError`` (``randint(0, -1)``) when the scraped list was
        empty; both cases now raise a clear error instead.

        Raises:
            RuntimeError: if no scraped proxy is currently valid.
        """
        candidates = list(cls.get_ip_list())
        random.shuffle(candidates)  # spread load instead of hammering one proxy
        for ip_address in candidates:
            if cls.check_valid(ip_address):
                return {'http': ip_address}
        raise RuntimeError('no valid proxy available')

if __name__ == '__main__':
    # Fetch one validated proxy mapping and display it.
    proxy_mapping = ip_proxy.get_proxy()
    print(proxy_mapping)