百度API批量识别表格与文字(记录)
1.安装百度文字识别的Python包:pip install baidu-aip

2.申请ID、KEY等。打开百度,查看全部产品,找到AI。

注册、登录百度智能云。

找到文字识别,创建应用,即可申请到APP_ID及两个KEY(API_KEY和SECRET_KEY)。

3.简单代码实现。
from aip import AipOcr
#
""" 你的 APPID AK SK """
# Credentials of the Baidu OCR application. The values shown here are masked
# placeholders; substitute the APP_ID, API_KEY and SECRET_KEY issued for your
# own application.
APP_ID = '1530xxxx'
API_KEY = 'fXp9dxxxxxxxxxxxxxxxxxxx'
SECRET_KEY = 'Ltvxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
# OCR client used by the recognition calls further down in this script.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
#
""" 读取图片 """
def get_file_content(filePath):
    """Return the complete content of ``filePath`` as raw bytes.

    The file is opened in binary mode, so nothing is decoded or translated.
    """
    with open(filePath, 'rb') as image_file:
        raw_bytes = image_file.read()
    return raw_bytes
# Directory in which the images to recognise are collected.
path = 'C:/Users/Administrator/Desktop/ocr'
img = get_file_content(path + '/' + u'eng2.png')

# General text recognition. On success the response is a dict whose
# 'words_result' entry is a list of dicts, one per recognised line, each
# holding the line's text under 'words'.
res = client.general(img)
if 'words_result' not in res:
    # A response without 'words_result' is presumably an error payload
    # (TODO confirm against the API docs); show it instead of failing with
    # an uninformative KeyError.
    raise RuntimeError('Baidu OCR request failed: %r' % (res,))

# Join all recognised lines into one string (linear, unlike repeated +=).
strs = ''.join(item['words'] for item in res['words_result'])
print(strs)
#
4.批量识别表格与文字。
# -*- coding: UTF-8 -*-
from aip import AipOcr
import time
import os
import requests
""" 你的 APPID AK SK """
# Credentials of the Baidu OCR application. The values shown here are masked
# placeholders; substitute the APP_ID, API_KEY and SECRET_KEY issued for your
# own application.
APP_ID = '1544xxxx'
API_KEY = 'VAg0XiGxxxxxxxxxxxxxmoy4'
SECRET_KEY = '3GuciPY1Zxxxxxxxxxxxxxxxxxxxxxxo'
# Module-level OCR client; ocr() below issues all of its requests through it.
client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
""" 读取图片 """
def get_file_content(filePath):
    """Read the file at ``filePath`` in binary mode and return its bytes."""
    with open(filePath, 'rb') as source:
        content = source.read()
    return content
def write_txt(x_fname, x_fstr):
    """Save a text string to ``<x_fname>.txt`` encoded as UTF-8.

    Args:
        x_fname: Output path without the ``.txt`` extension.
        x_fstr: Text to store; it is encoded with ``x_fstr.encode('utf8')``.

    Returns:
        None.
    """
    out_path = x_fname + '.txt'
    # Binary mode: the explicitly encoded bytes are written as-is, which a
    # text-mode handle rejects on Python 3. The ``with`` block closes the
    # file even when the write raises.
    with open(out_path, 'wb') as wt:
        wt.write(x_fstr.encode('utf8'))
    print(u'保存文件成功:' + out_path)
    return
def file_download(url, wf_name, timeout=30):
    """Download ``url`` and store the response body as ``<wf_name>.xls``.

    Args:
        url: Address of the file to fetch.
        wf_name: Output path without the ``.xls`` extension.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # Without a timeout a stalled server would block the script indefinitely.
    r = requests.get(url, timeout=timeout)
    # Reject 4xx/5xx answers so an error page is never saved as a spreadsheet.
    r.raise_for_status()
    print(u'已获取网络对象')
    out_path = wf_name + '.xls'
    with open(out_path, 'wb') as f:
        f.write(r.content)
    print(u'下载文件成功:' + out_path)
    return
def ocr(img, wf_name):
    """Recognise one image as a table, falling back to plain text.

    The image is submitted to asynchronous table recognition and the result
    is polled until it is reported as finished; the result file is then
    downloaded to ``<wf_name>.xls``. If a poll answers with an
    ``error_code`` the image is treated as non-tabular: it is sent to
    general text recognition and the recognised lines are written to
    ``<wf_name>.txt``.

    Args:
        img: Raw bytes of the image.
        wf_name: Output path without extension.

    Returns:
        None. Failures after the initial submission are printed to stdout
        (with the exception detail) and are not re-raised.
    """
    res = client.tableRecognitionAsync(img)
    print(res)
    try:
        # ID assigned to the submitted image; needed to poll for the result.
        req_id = res['result'][0]['request_id']
        print(u'ID已获取: %s' % (req_id,))
        while True:
            print(u'暂停3秒...读取res2')
            time.sleep(3)
            # Second request: ask for the table result (XLS address) by ID.
            res2 = client.getTableRecognitionResult(req_id)
            time.sleep(1)
            print(u'res2读取成功! %s' % (res2,))

            if 'error_code' in res2:
                # Not a table: recognise the local image as ordinary text.
                print(u'表格识别错误,重新获取文本res')
                res = client.general(img)
                time.sleep(1)
                print(u'重新获取文本res成功!')
                print(res)
                strs = ''.join(item['words'] for item in res['words_result'])
                write_txt(wf_name, strs)
                break

            try:
                print(u'是否完成msg')
                msg = res2['result']['ret_msg']
                if msg == u'已完成':
                    url = res2['result']['result_data']
                    print(u'已获取下载地址: %s' % (url,))
                    print(u'文件名: %s' % (wf_name,))
                    file_download(url, wf_name)
                    print(u'下载完成!')
                    break
                # Not finished yet: wait a little longer before the next poll.
                time.sleep(1)
            except Exception as e:
                print(u'错误')
                # Show what went wrong instead of discarding the exception.
                print(repr(e))
                break
            else:
                # Reached only when the result is not ready and nothing failed.
                print(u'等一哈...')
    except Exception as e:
        print(u'出错了....')
        print(repr(e))
def main(path='C:/Users/Administrator/Desktop/ocr'):
    """Interactively recognise one file, or every file of a sub-directory.

    The working directory ``path`` is created when missing. The user is then
    asked for a name relative to it:

    * a file name: that file is recognised and the function returns;
    * a directory name: every regular file directly inside that directory
      is recognised and the function returns;
    * anything else, or a directory without files: a "not found" message is
      printed and the prompt is shown again.

    Args:
        path: Working directory that holds the files to recognise.

    Returns:
        None.
    """
    # raw_input exists only on Python 2; input is its Python 3 counterpart.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    if not os.path.exists(path):
        os.makedirs(path)
    print(u'桌面OCR目录OK!')

    while True:
        inp = read_line(
            u'请将文件拷贝到桌面OCR目录下,输入文件名:\n'
            u'如果有多个文件,请在ocr目录下新建目录,文件放在该目录下,输入目录名即可(ctrl+c跳出):'
        )
        # Resolve against the unchanged base directory on every round, so a
        # rejected entry cannot shift where later entries are looked up.
        target = path + '/' + inp

        if inp and os.path.isfile(target):
            print(u'找到一个文件!识别开始...')
            img = get_file_content(target)
            print(u'获取长度:' + str(len(img)))
            # splitext removes only the final extension, so "a.b.png" yields
            # "a.b" rather than being cut at the first dot.
            wf_name = os.path.splitext(target)[0]
            ocr(img, wf_name)
            break

        if inp and os.path.isdir(target):
            print(u'找到一个目录!')
            # Keep regular files only: get_file_content cannot open a
            # nested directory.
            file_names = [
                entry for entry in os.listdir(target)
                if os.path.isfile(target + '/' + entry)
            ]
            if file_names:
                print(u'找到' + str(len(file_names)) + u'个文件!识别开始....')
                n = 0
                for name in file_names:
                    img = get_file_content(target + '/' + name)
                    wf_name = os.path.splitext(target + '/' + name)[0]
                    ocr(img, wf_name)
                    n += 1
                print(u'共写' + str(n) + u'个文件!')
                break

        print(u'文件不存在!')
# Start the interactive tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
原文链接:https://blog.csdn.net/woho778899/article/details/89382954