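"""Batch lookup of company business-registration information.

Automatically reads the list of names to look up from an Excel file, queries
the information for each one, and leaves extracting the relevant details to
the user.
"""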
import requests
from bs4 import BeautifulSoup
from openpyxl import load_workbook
import re
import urllib
import random
def get_user_agent():
    """Return one browser User-Agent string picked at random from a fixed pool."""
    candidates = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]
    # The pool order is kept as-is so a seeded RNG yields the same pick.
    return random.choice(candidates)
def craw(url, timeout=10):
    """Fetch a search-results page and return the first link found in it.

    Args:
        url: Address of the search page to request.
        timeout: Seconds to wait for the server; passed to ``requests.get``
            so a stalled connection cannot hang the whole batch.

    Returns:
        The first ``href`` value that appears in the serialized elements
        carrying the CSS class ``ma_h1``.

    Raises:
        IndexError: If no ``href`` is present in those elements.
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # NOTE(review): the conditional headers (If-Modified-Since / If-None-Match)
    # may allow the server to answer 304 with an empty body - confirm they are
    # really wanted here.
    headers = {
        'Host': 'www.qichacha.com',
        'User-Agent': get_user_agent(),
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://www.qichacha.com/',
        'Cookie': r'UM_distinctid***************',
        'Connection': 'keep-alive',
        'If-Modified-Since': 'Wed, 30 **********',
        'If-None-Match': '"59*******"',
        'Cache-Control': 'max-age=0',
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # The body is parsed on every path, so the encoding must be fixed on every
    # path too - not only when the status is an error.
    response.encoding = 'utf-8'
    if response.status_code != 200:
        print(response.status_code)
        print('ERROR')
    soup = BeautifulSoup(response.text, 'lxml')
    com_names = soup.find_all(class_='ma_h1')
    print(com_names)
    # Matches the value of a double- or single-quoted href attribute.
    res_url = r"(?<=href=\").+?(?=\")|(?<=href=\').+?(?=\')"
    match = re.search(res_url, str(com_names), re.I | re.S | re.M)
    if match is None:
        # Same exception type as the former bare ``link[0]``, but with a
        # message that says which search came back empty.
        raise IndexError('no company link found in search results for {}'.format(url))
    return match.group(0)
def crawler_company(url, company, timeout=30):
    """Download a company page and save it as ``./<company>.html``.

    Args:
        url: Address of the company page to download.
        company: Name used as the base name of the output file.
        timeout: Seconds ``urlopen`` may block before giving up.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        OSError: If the output file cannot be written.
    """
    # A bare ``import urllib`` does not load the ``request`` submodule; import
    # it explicitly instead of relying on another package having done so.
    import urllib.request

    request = urllib.request.Request(url)
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0')
    # BeautifulSoup reads the whole response inside the constructor, so the
    # connection can be released as soon as parsing is done.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        soup = BeautifulSoup(response, 'lxml')
    context = str(soup)
    path = "./" + company + ".html"
    # ``with`` closes the file even when the write fails.
    with open(path, "wb") as file:
        file.write(context.encode("utf-8"))
if __name__ == '__main__':
    # Script entry point: read company names from column 2 of 'company.xlsx',
    # search each one, and save the first matching company page to disk.
    from urllib.parse import quote

    wb = load_workbook('company.xlsx')
    # Index the workbook directly; ``get_sheet_by_name`` is deprecated.
    sheet = wb['sheet1']
    # Start at row 2, as the original loop did (row 1 is presumably a header).
    for row_number in range(2, sheet.max_row + 1):
        key_word = sheet.cell(row=row_number, column=2).value
        # Blank cells would otherwise be searched as the literal text "None".
        if key_word is None or not str(key_word).strip():
            continue
        # Cell values may be numbers; the file name and URL both need text.
        company = str(key_word)
        print(key_word, row_number - 1)
        print('正在搜索,请稍后')
        # Percent-encode the name so characters such as '&', '#' or '?' cannot
        # break the query string.
        url = r'http://www.qichacha.com/search?key={}#p:{}&'.format(quote(company), 1)
        s1 = craw(url)
        url_company = "http://www.qichacha.com" + s1
        crawler_company(url_company, company)