Transcribing audio files with the Tencent Cloud API in Python 3 (local audio files or audio URLs)
References for this post:
- https://blog.csdn.net/wangliuqi123/article/details/80527417
- https://blog.csdn.net/wangliuqi123/article/details/80537635
I previously used the Baidu and iFlytek (Xunfei) APIs for audio transcription. Baidu's is almost free, but the accuracy is rather disappointing; iFlytek's results are very good, but it is expensive. So I went looking for a middle ground: recognition quality that is not too bad, at a price that is not too steep. After shopping around, I ended up at Tencent.
Tencent's speech recognition also ships with a Python SDK: https://cloud.tencent.com/document/product/1093/35731. The demo can be downloaded from that page, but for now it is Python 2 only. Tencent says Python 3 support will follow, but if we can use Python 3 today, why not? Porting the demo does, however, involve stepping into quite a few pits.
The demo consists mainly of three Python files: Config.py, OfflineCLient.py and offlineSdk.py. Below I walk through the changes needed to run them under Python 3.
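Before going file by file: the single most error-prone change is the request signature. In Python 3, hmac.new() requires bytes, and the base64-encoded result should go into the request headers as-is rather than wrapped in str(). A minimal standalone sketch of just that step (the key and signing string below are placeholders, not real values):

import base64
import hashlib
import hmac

secret_key = "your-secret-key"                          # placeholder
sign_str = "POSTaai.qcloud.com/asr/v1/<appid>?a=1&b=2"  # placeholder signing string

# hmac.new() needs bytes in Python 3; base64-encode the digest as before.
authorization = base64.b64encode(
    hmac.new(secret_key.encode("utf-8"), sign_str.encode("utf-8"), hashlib.sha1).digest())

# Pass the bytes value straight through; wrapping it in str() yields "b'...'"
# and authentication fails.
headers = {"Authorization": authorization}

The same change shows up in offlineSdk.py below.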
There is little to change in Config.py; just fill in SECRET_KEY, SECRETID and APPID with the values from your own account.
# -*- coding:utf-8 -*-
'''
Created on 2019-4-28
@author: iantang
'''
class Config:
    'Global configuration. Change the values below to match your own setup.'

    # ------------- Required fields ---------------
    # How to obtain AppId, secretId and secretKey:
    # https://cloud.tencent.com/document/product/441/6203
    # Console path: your account (top right) --> Access Management --> Access Keys (left menu) --> API Key Management
    SECRET_KEY = '***'
    SECRETID = '*****'
    APPID = '***'
    # The recognition result is POSTed to this URL. You need to run your own
    # service to receive the POST data.
    CALLBACK_URL = "http://xxx.xxxxx.xxx"  # the callback URL is mandatory; how to fill it in is covered later

    # ------------- Optional, set according to your needs ---------------
    # Recognition engine: 8k_0, 16k_0 or 8k_6. 8k_6 supports speaker separation, which is why I use it here.
    ENGINE_MODEL_TYPE = '8k_6'
    # Number of audio channels: 1 or 2. The 8k telephony model supports 1 and 2; the other models only support 1 channel.
    CHANNEL_NUM = 1
    # Text encoding of the result: 0: UTF-8, 1: GB2312, 2: GBK, 3: BIG5
    RES_TEXT_FORMAT = 0
    # Audio source: 0: audio URL; 1: audio data in the POST body
    SOURCE_TYPE = 0

    # ------------- Optional, the defaults are fine ---------------
    # Tencent Cloud project ID. 0, or the ID configured under Console -> Account Center -> Project Management.
    PROJECT_ID = 0
    # Sub-service type. 0: offline speech recognition.
    SUB_SERVICE_TYPE = 0
    # Result delivery. 0: synchronous; 1: asynchronous. Only asynchronous is currently supported.
    RES_TYPE = 1
    # Tencent server URL, normally no need to change.
    REQUEST_URL = "https://aai.qcloud.com/asr/v1/"
    # URL used when building the signature, normally no need to change.
    SIGN_URL = "aai.qcloud.com/asr/v1/"

    # ------------- Initialization and validation, can be skipped ---------------
    def __init__(self):
        print("")

    def verifyProperties(self):
        if len(str(self.SECRET_KEY)) == 0:
            print('SECRET_KEY can not be empty')
            return
        if len(str(self.SECRETID)) == 0:
            print('SECRETID can not be empty')
            return
        if len(str(self.APPID)) == 0:
            print('APPID can not be empty')
            return
        if len(str(self.CALLBACK_URL)) == 0:
            print('CALLBACK_URL can not be empty')
            return
        if len(str(self.ENGINE_MODEL_TYPE)) == 0 or (
                str(self.ENGINE_MODEL_TYPE) != '8k_0' and str(self.ENGINE_MODEL_TYPE) != '16k_0' and str(self.ENGINE_MODEL_TYPE) != '8k_6'):
            print('ENGINE_MODEL_TYPE is not right')
            return
        # 1 or 2 channels are valid (the original demo checked for '0'/'1' here, which contradicts the comment above).
        if len(str(self.CHANNEL_NUM)) == 0 or (str(self.CHANNEL_NUM) != '1' and str(self.CHANNEL_NUM) != '2'):
            print('CHANNEL_NUM is not right')
            return
        if len(str(self.RES_TEXT_FORMAT)) == 0 or (str(self.RES_TEXT_FORMAT) != '0' and str(self.RES_TEXT_FORMAT) != '1' and str(
                self.RES_TEXT_FORMAT) != '2' and str(self.RES_TEXT_FORMAT) != '3'):
            print('RES_TEXT_FORMAT is not right')
            return
        if len(str(self.SOURCE_TYPE)) == 0 or (str(self.SOURCE_TYPE) != '0' and str(self.SOURCE_TYPE) != '1'):
            print('SOURCE_TYPE is not right')
            return
        if len(str(self.PROJECT_ID)) == 0:
            print('PROJECT_ID can not be empty')
            return
        if len(str(self.SUB_SERVICE_TYPE)) == 0 or (str(self.SUB_SERVICE_TYPE) != '0' and str(self.SUB_SERVICE_TYPE) != '1'):
            print('SUB_SERVICE_TYPE is not right')
            return
        if len(str(self.RES_TYPE)) == 0 or (str(self.RES_TYPE) != '0' and str(self.RES_TYPE) != '1'):
            print('RES_TYPE is not right')
            return


config = Config()
config.verifyProperties()
Most of the changes live in offlineSdk.py; I have annotated them in the source below.
# -*- coding:utf-8 -*-
import requests
import hashlib
import time
import hmac
import base64
import urllib
import urllib.parse  # added for Python 3
import Config

def task_process(audio_url):
    request_data = dict()
    request_data['channel_num'] = Config.config.CHANNEL_NUM
    request_data['secretid'] = Config.config.SECRETID
    request_data['engine_model_type'] = Config.config.ENGINE_MODEL_TYPE
    request_data['timestamp'] = int(time.time())
    request_data['expired'] = int(time.time()) + 3600
    request_data['nonce'] = 6666
    request_data['projectid'] = Config.config.PROJECT_ID
    request_data['callback_url'] = Config.config.CALLBACK_URL
    request_data['res_text_format'] = Config.config.RES_TEXT_FORMAT
    request_data['res_type'] = Config.config.RES_TYPE
    request_data['source_type'] = Config.config.SOURCE_TYPE
    # request_data['url'] = urllib.quote(audio_url)  # Python 2 version, replaced by the line below
    request_data['url'] = audio_url
    authorization = generate_sign(request_data)
    task_req_url = generate_request(request_data)
    header = {
        "Content-Type": "application/json",
        # "Authorization": str(authorization)  # do not wrap in str() under Python 3, or authentication fails
        "Authorization": authorization
    }
    r = requests.post(task_req_url, headers=header, data=request_data)
    return r.text


def generate_sign(request_data):
    sign_str = "POST" + Config.config.SIGN_URL + str(Config.config.APPID) + "?"
    sort_dict = sorted(request_data.keys())
    for key in sort_dict:
        # sign_str = sign_str + key + "=" + urllib.unquote(str(request_data[key])) + '&'  # Python 2: urllib -> urllib.parse
        sign_str = sign_str + key + "=" + urllib.parse.unquote(str(request_data[key])) + '&'
    sign_str = sign_str[:-1]
    # authorization = base64.b64encode(hmac.new(Config.config.SECRET_KEY, sign_str, hashlib.sha1).digest())  # Python 2 version, replaced below
    authorization = base64.b64encode(hmac.new(bytes(Config.config.SECRET_KEY, 'utf-8'), bytes(sign_str, 'utf-8'), hashlib.sha1).digest())
    return authorization


def generate_request(request_data):
    result_url = Config.config.REQUEST_URL + str(Config.config.APPID) + "?"
    for key in request_data:
        result_url = result_url + key + "=" + str(request_data[key]) + '&'
    result_url = result_url[:-1]
    return result_url


if __name__ == '__main__':
    # Audio URL, downloadable from the public internet. Required when source_type is 0,
    # omitted when it is 1; length must be > 0 and < 2048.
    audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
    task_process(audio_url)
There is nothing to change in OfflineCLient.py either; audio_url just has to be a URL reachable from the public internet, then run it.
# -*- coding:utf-8 -*-
import offlineSdk
import Config
# Note: update the settings in Config.py with your own values before using this.
# Each call to task_process sends one recognition request.
# Audio URL, downloadable from the public internet. Required when source_type is 0,
# omitted when it is 1; length must be > 0 and < 2048.
audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
# Call the recognition function and get the response
result = offlineSdk.task_process(audio_url)
print(result)

# ------------------------------------------------------------------------------------
# If you need to change parameters on the fly, just modify them and send another request, e.g.:
# Config.config.CALLBACK_URL = ""
# Config.config.ENGINE_MODEL_TYPE = "16k_0"
# ......
# audio_url = "https://xuhai2-1255824371.cos.ap-chengdu.myqcloud.com/test.wav"
# result = offlineSdk.task_process(audio_url)
# print(result)
Run it and the output looks something like this; there is a "success" in it, so the request appears to have gone through:

(Screenshot: the response returned by the request, which includes a success message.)
At this point you will surely ask: where is the transcription of my recording? Why did nothing come back?
You may remember the CALLBACK_URL = "http://xxx.xxxxx.xxx" setting in Config.py, which, according to Tencent's speech recognition API documentation, is mandatory. So what goes there? Tencent delivers the recognition result via a callback rather than letting you poll for it the way iFlytek does, so you have to run your own service to handle the POST request Tencent sends back. The simplest way to do that is to write a Flask route that handles the callback. My own parsing code is below; feel free to take it if you find it useful:
import json
from flask import Flask, request
app = Flask(__name__)
@app.route('/data', methods=['POST'])
def testpost():
    if request.method == 'POST':
        # print('Tencent called back')
        print(request.form)
        if request.form['message'] == '成功':
            filename = request.form['audioUrl'].split('/')[-1].split('.')[0]  # file name without extension
            txt_file = filename + ".txt"  # TXT file the transcription is saved to
            doc = open(txt_file, 'w', encoding='utf-8')
            recognition_text = request.form['text']
            sentence_list = recognition_text.split('\n')[0:-1]  # the last element of the list is an empty string
            for sentence in sentence_list:
                content = sentence.split(' ')[1]  # content of a single sentence
                begin_time = sentence.split(' ')[0].split(',')[0][1:]  # start time of the sentence
                begin_time = str(int(begin_time.split(":")[0]) * 60000 + int(begin_time.split(":")[1].replace(".", "")))
                end_time = sentence.split(' ')[0].split(',')[1]  # end time of the sentence
                end_time = str(int(end_time.split(":")[0]) * 60000 + int(end_time.split(":")[1].replace(".", "")))
                speaker = sentence.split(' ')[0].split(',')[-1][:-1]  # speaker id
                print(speaker + "\t" + content + '\t' + filename + '\t' + begin_time + '\t' + end_time)
                print(speaker + "\t" + content + '\t' + filename + '\t' + begin_time + '\t' + end_time, file=doc)
            doc.close()
            resp = {
                "code": 0,
                "message": "成功"
            }
        else:
            resp = {
                "code": 1,
                "message": "失败"
            }
        print(resp)
        return json.dumps(resp)


if __name__ == '__main__':
    app.run(host="0.0.0.0", port=9979, threaded=True)
Of course, if you have a publicly reachable IP address or domain name, run the program above on that Linux host:

(Screenshot: the Flask callback service running on the Linux host.)

Then set CALLBACK_URL = 'http://your-public-IP:9979/data' in Config.py, run OfflineCLient.py with Python 3, and after a short while the transcription result arrives on the Linux side.
If you do not have a publicly reachable IP or domain, see https://blog.csdn.net/wangliuqi123/article/details/80537635 for how to set that up.
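Before pointing Tencent at the service, you can sanity-check the Flask route locally by POSTing a fake callback to it. The sketch below only fills in the message, audioUrl and text form fields that the handler above reads, and the "[begin,end,speaker] sentence" text layout is inferred from the parsing code rather than taken from the official callback documentation:

import requests

# Minimal fake callback payload (assumed shape, see the note above).
fake_callback = {
    "message": "成功",                              # the handler checks for this value
    "audioUrl": "http://example.com/test.wav",      # only used to derive the output file name
    "text": "[0:1.640,0:3.480,0] 你好，请问有什么可以帮您？\n",
}

r = requests.post("http://127.0.0.1:9979/data", data=fake_callback)
print(r.text)  # expect a code-0 response and a test.txt written next to the server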
If you want to upload a local audio file for recognition instead, see the following demo:
# -*- coding: utf-8 -*-
"""
@author: Looking
@email: 2392863668@qq.com
"""
import requests
import hashlib
import time
import hmac
import base64
import urllib.parse
import json
import Config
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.asr.v20190614 import asr_client, models
appid = Config.config.APPID
req_url = "https://aai.qcloud.com/asr/v1/"
callback_url = "http://*******:9979/data"  # your own service that receives the POSTed result
sign_url = "aai.qcloud.com/asr/v1/"
secret_id = Config.config.SECRETID
secret_key = Config.config.SECRET_KEY

def task_process(audio_url):
    request_data = dict()
    request_data['channel_num'] = 1
    request_data['secretid'] = secret_id
    request_data['engine_model_type'] = "8k_6"
    request_data['timestamp'] = int(time.time())
    request_data['expired'] = int(time.time()) + 3600
    request_data['nonce'] = 1559
    request_data['projectid'] = 0
    request_data['callback_url'] = callback_url
    request_data['res_text_format'] = 0
    request_data['res_type'] = 1
    request_data['source_type'] = 1  # 1: the audio data is sent in the POST body
    request_data['sub_service_type'] = 0
    with open(audio_url, 'rb') as f:
        body_data = f.read()
    body_len = str(len(body_data))
    authorization = generate_sign(request_data, appid)
    task_req_url = generate_request(request_data, appid)
    header = {
        "Authorization": authorization,
        "Content-Length": body_len
    }
    r = requests.post(task_req_url, headers=header, data=body_data)
    # print(task_req_url)
    # print(r.text)
    return r.text


def generate_sign(request_data, appid):
    sign_str = "POST" + sign_url + str(appid) + "?"
    sort_dict = sorted(request_data.keys())
    for key in sort_dict:
        sign_str = sign_str + key + "=" + urllib.parse.unquote(str(request_data[key])) + '&'
    sign_str = sign_str[:-1]
    authorization = base64.b64encode(
        hmac.new(bytes(Config.config.SECRET_KEY, 'utf-8'), bytes(sign_str, 'utf-8'), hashlib.sha1).digest())
    # authorization = base64.b64encode(hmac.new(secret_key, sign_str, hashlib.sha1).digest())  # Python 2 version
    return authorization


def generate_request(request_data, appid):
    result_url = req_url + str(appid) + "?"
    for key in request_data:
        result_url = result_url + key + "=" + str(request_data[key]) + '&'
    result_url = result_url[:-1]
    return result_url


def get_requestId(audio_file_path):
    request_result = task_process(audio_file_path)
    print(request_result)
    # The response is a JSON string; json.loads is safer than the original eval().
    requestId = json.loads(request_result)["requestId"]
    return requestId


def get_recognition_result(requestId):
    try:
        cred = credential.Credential(Config.config.SECRETID, Config.config.SECRET_KEY)
        httpProfile = HttpProfile()
        httpProfile.endpoint = "asr.tencentcloudapi.com"
        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        client = asr_client.AsrClient(cred, "ap-guangzhou", clientProfile)
        while True:
            req = models.DescribeTaskStatusRequest()
            # e.g. TaskId 537731632
            params = '{"TaskId":%s}' % requestId
            req.from_json_string(params)
            resp = client.DescribeTaskStatus(req)
            recognition_text = json.loads(resp.to_json_string())
            recognition_status = recognition_text['Data']['StatusStr']
            if recognition_status == "success":
                print(recognition_text['Data']['TaskId'], "recognition succeeded!")
                break
            if recognition_status == "failed":
                raise TencentCloudSDKException
            time.sleep(1)
        # print(recognition_text)
        recognition_text = recognition_text['Data']['Result']
        sentence_list = recognition_text.split('\n')[0:-1]  # the last element of the list is an empty string
        for sentence in sentence_list:
            content = sentence.split(' ')[1]  # content of a single sentence
            begin_time = sentence.split(' ')[0].split(',')[0][1:]  # start time of the sentence
            begin_time = str(int(begin_time.split(":")[0]) * 60000 + int(begin_time.split(":")[1].replace(".", "")))
            end_time = sentence.split(' ')[0].split(',')[1]  # end time of the sentence
            end_time = str(int(end_time.split(":")[0]) * 60000 + int(end_time.split(":")[1].replace(".", "")))
            speaker = sentence.split(' ')[0].split(',')[-1][:-1]  # speaker id
            print(speaker + "\t" + content + '\t' + begin_time + '\t' + end_time)
            # print(speaker + "\t" + content + '\t' + filename + '\t' + begin_time + '\t' + end_time, file=doc)
    except TencentCloudSDKException as err:
        print(err)


if __name__ == '__main__':
    audio_file_path = r"D:\MyProject\Python\audio_recognition\audio\o2020031309513910300127.wav"
    requestId = get_requestId(audio_file_path)
    get_recognition_result(requestId)
Apart from the missing audioUrl field, the data returned through the callback in this local-upload case is identical to what comes back when the recording is submitted by URL.
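Incidentally, the same "[begin,end,speaker] sentence" parsing appears in both the Flask callback and the polling demo, so it can be pulled into one helper. A small sketch (parse_sentences is my own name, not part of the Tencent demo, and it assumes the result text format used above):

def parse_sentences(recognition_text):
    """Yield (speaker, content, begin_ms, end_ms) for each line of the result text."""

    def to_ms(t):
        # "m:ss.mmm" -> milliseconds, matching the arithmetic used above
        minutes, seconds = t.split(':')
        return int(minutes) * 60000 + int(seconds.replace('.', ''))

    for sentence in recognition_text.split('\n')[:-1]:  # the last element is an empty string
        header, content = sentence.split(' ', 1)  # "[0:1.640,0:3.480,0]" plus the sentence text
        begin, end, speaker = header.strip('[]').split(',')
        yield speaker, content.strip(), to_ms(begin), to_ms(end)

Both the Flask route and get_recognition_result could then iterate over parse_sentences(...) instead of repeating the string slicing.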