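"""Harvest email addresses for a given domain from Bing and Baidu search
results. Every new address found is printed and appended to data.txt."""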
import sys
import getopt
import re

import requests
from bs4 import BeautifulSoup
def start(argv):
    url = ""
    pages = ""
    if len(sys.argv) < 2:
        print("-h for help\n")
        sys.exit()
    try:
        # "u" and "p" take an argument, "h" does not.
        opts, args = getopt.getopt(argv, "u:p:h")
    except getopt.GetoptError:
        print('Error: invalid argument!')
        sys.exit()
    for opt, arg in opts:
        if opt == "-u":
            url = arg
        elif opt == "-p":
            pages = arg
        elif opt == "-h":
            usage()
    launcher(url, pages)
def usage():
    print('-h: --help   show this help')
    print('-u: --url    target domain')
    print('-p: --pages  number of result pages to crawl')
    print('e.g.: python ' + sys.argv[0] + ' -u "www.baidu.com" -p 100' + '\n')
    sys.exit()
def launcher(url, pages):
    email_num = []
    # Search keywords; the Chinese terms 邮件/邮箱 mean "email"/"mailbox"
    # and catch Chinese-language pages in Baidu results.
    key_words = ['email', 'mail', 'mailbox', '邮件', '邮箱', 'postbox']
    for page in range(1, int(pages) + 1):
        for key_word in key_words:
            bing_emails = bing_search(url, page, key_word)
            baidu_emails = baidu_search(url, page, key_word)
            sum_emails = bing_emails + baidu_emails
            for email in sum_emails:
                # Only report and save addresses not seen before.
                if email not in email_num:
                    print(email)
                    with open('data.txt', 'a+') as f:
                        f.write(email + '\n')
                    email_num.append(email)
def bing_search(url, page, key_word):
    referer = "http://cn.bing.com/search?q=email+site%3abaidu.com&qs=n&sp=-1&pq=emailsite%3abaidu.com&first=1&FORM=PERE1"
    conn = requests.session()
    # Note the "+" joining the keyword and the site: filter; Bing pages
    # its results ten at a time via the "first" parameter.
    bing_url = ("https://cn.bing.com/search?q=" + key_word + "+site%3a" + url +
                "&qs=n&sp=-1&pq=" + key_word + "+site%3a" + url +
                "&first=" + str((page - 1) * 10) + "&FORM=PERE1")
    # Hit the homepage first so the session picks up cookies.
    conn.get('http://cn.bing.com', headers=headers(referer))
    r = conn.get(bing_url, stream=True, headers=headers(referer), timeout=8)
    emails = search_email(r.text)
    return emails
def baidu_search(url, page, key_word):
    email_list = []
    referer = "https://www.baidu.com/s?wd=email+site%3Abaidu.com&pn=1"
    baidu_url = ("https://www.baidu.com/s?wd=" + key_word + "+site%3A" + url +
                 "&pn=" + str((page - 1) * 10))
    conn = requests.session()
    conn.get(referer, headers=headers(referer))
    r = conn.get(baidu_url, headers=headers(referer))
    soup = BeautifulSoup(r.text, 'lxml')
    # Each result title sits in an <h3> whose link goes through Baidu's
    # redirector, so fetch every landing page and scrape it for addresses.
    tagh3 = soup.find_all('h3')
    for h3 in tagh3:
        a = h3.find('a')
        if a is None:
            continue
        href = a.get('href')
        try:
            r = requests.get(href, headers=headers(referer), timeout=8)
            email_list.extend(search_email(r.text))
        except Exception:
            pass
    return email_list
def search_email(html):
    # Case-insensitive match for anything shaped like an email address.
    emails = re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", html, re.I)
    return emails
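# A quick sanity check for the regex above (the sample string is made up
# for illustration):
#   >>> search_email('contact us at admin@example.com or SALES@Example.COM')
#   ['admin@example.com', 'SALES@Example.COM']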
def headers(referer):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip,deflate',
        'Referer': referer,
    }
    return headers
if __name__ == '__main__':
    try:
        start(sys.argv[1:])
    except KeyboardInterrupt:
        print("interrupted by user, exiting...")
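# Example invocation (the script filename is assumed; adjust to your copy):
#   python email_crawler.py -u "baidu.com" -p 10
# Requires requests, beautifulsoup4, and lxml:
#   pip install requests beautifulsoup4 lxml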