新都在

新都在

Python之Email邮箱账号抓取

29
2021-10-10
Python之Email邮箱账号抓取

Python之Email邮箱账号抓取

实现对关键字内容获取到的账号进行抓取并根据指定邮箱后缀进行过滤

实现代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent


def get_http_headers():
    try:
        ua = UserAgent()
        header = {
            "User-Agent": ua.random,
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7,zh-TW;q=0.6"
        }
    except Exception as e:
        header = {
            u'User-Agent': u'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'}
    return header


class EmailAccountGrabbing:
    def __init__(self, keyword="test", email_count=2, grabbing_engine="https://www.bing.com/search", email_suffix="hotmail.com"):
        """
        邮箱账号抓取初始化
        :param keyword: 关键字,不能为空,例如行业关键字、内容关键字等
        :param email_count: 抓取页数,默认每页10条信息,总抓取账号数<=email_count*10,需要注意email_count过大会导致连接超时
        :param grabbing_engine: 抓取账号的搜索引擎,默认为bing引擎,可以修改为google的引擎,但是修改后需要配置代理,否则无法正常访问
        :param email_suffix: 邮箱后缀,用户过滤账号使用。例如: qq.com / hotmail.com
        """
        self.keyword = keyword
        self.email_count = email_count
        self.grabbing_engine = grabbing_engine
        self.email_suffix = email_suffix

    def grabbing(self):
        """
        账号抓取服务
        :return: .
        """
        email_grabbing_result = []
        for i in range(1, self.email_count):
            j = 1
            if i != 1:
                j = i*10+1
            params = {
                "q": "%s @%s" % (self.keyword, self.email_suffix),
                "go": "搜索",
                "qs": "bs",
                "form": "QBRE",
                "first": j
            }
            cookies = {
                "SRCHHPGUSR": "NRSLT=50"
            }
            ret = requests.get(self.grabbing_engine, headers=get_http_headers(), params=params, cookies=cookies)
            soup = BeautifulSoup(ret.text, "html.parser")
            nodes = soup.find_all(class_="b_caption")
            re_email = re.compile('\w+[a-zA-Z0-9_.\-]*@%s' % self.email_suffix)
            for node in nodes:
                emails = re_email.findall(node.text)
                for email in emails:
                    email_grabbing_result.append({
                        "grabbingEngine": ret.url,
                        "emailAddress": email,
                        "keyword": self.keyword
                    })
        return email_grabbing_result


if __name__ == '__main__':
    grabbing_result = EmailAccountGrabbing(keyword="led", email_suffix="qq.com").grabbing()
    print(grabbing_result)

结果

[{
	'grabbingEngine': 'https://cn.bing.com/search?q=led+%40qq.com&go=%E6%90%9C%E7%B4%A2&qs=bs&form=QBRE&first=1',
	'emailAddress': '49768913@qq.com',
	'keyword': 'led'
}, {
	'grabbingEngine': 'https://cn.bing.com/search?q=led+%40qq.com&go=%E6%90%9C%E7%B4%A2&qs=bs&form=QBRE&first=1',
	'emailAddress': 'syxled.com865176079@qq.com',
	'keyword': 'led'
}, {
	'grabbingEngine': 'https://cn.bing.com/search?q=led+%40qq.com&go=%E6%90%9C%E7%B4%A2&qs=bs&form=QBRE&first=1',
	'emailAddress': '865176079@qq.com',
	'keyword': 'led'
}]