最近研究了下语音合成和语音识别,分别看了一些文章,也下载 SDK 写了些代码测试了下。
发现对于中文语音合成来说,百度语音和科大讯飞基本都差不多。
英文的话,百度合成出来的效果不佳,科大讯飞稍好点,但是总体都没有国外的语音合成好,比如 iSpeech、FreeTTS,可能是国外的母语都是英语的缘故吧。
百度日调用额度比较多,据说有2万额度。讯飞每天就500,有点少。iSpeech 是要收费的。FreeTTS 可以离线使用。
百度识别和合成代码:
public class SoundAPI
{
/** Logger for this class. */
private static final Logger logger = LoggerFactory.getLogger(SoundAPI.class);
/** Base directory for generated audio files, read from the "download.folder" config key. */
static final String FILE_PATH = Config.getString("download.folder");
// Baidu credentials: APP ID / API key / secret key (placeholders, replace with your own).
private static final String APP_ID = "你的APP ID";
private static final String API_KEY = "你的key";
private static final String SECRET_KEY = "你的秘钥";
/** Shared AipSpeech client; stays null until iniAPI() creates it. */
private static AipSpeech client = null;
/** Epoch milliseconds at which the client was last created; 0 means never. */
private static long iniTime = 0L;
/**
 * Client lifetime: 30 days expressed in milliseconds.
 * The 30L literal forces long arithmetic. Evaluated as int, the product
 * 2,592,000,000 exceeds Integer.MAX_VALUE and wraps to a negative number,
 * which makes every "older than MONTH_TIME" comparison succeed.
 */
private static final long MONTH_TIME = 30L * 24 * 60 * 60 * 1000;
/** Base64 codec used to turn a caller-supplied file name into an on-disk name. */
private static final Base64 base64 = new Base64();
/**
 * Ensures the shared AipSpeech client is available.
 * A new client is created when none exists yet, or when the time elapsed since
 * the last creation (iniTime) is greater than MONTH_TIME.
 */
private static void iniAPI()
{
    long now = System.currentTimeMillis();
    boolean expired = now - iniTime > MONTH_TIME;
    if (client != null && !expired)
    {
        // The existing client is still usable; nothing to do.
        return;
    }
    client = new AipSpeech(APP_ID, API_KEY, SECRET_KEY);
    // Connection timeout: 2 seconds.
    client.setConnectionTimeoutInMillis(2000);
    iniTime = System.currentTimeMillis();
}
/**
 * Synthesizes {@code text} with the Baidu speech client and stores the audio as an
 * MP3 file below FILE_PATH.
 *
 * The file name is the Base64 form of {@code fileName} with padding removed, placed
 * in a "zh" or "en" sub-folder. An already existing file is left untouched.
 *
 * @param text         text to synthesize; an empty value returns "" immediately
 * @param fileName     logical name that the stored file name is derived from
 * @param questionType selects the language: ENGLISH_WORD uses "en", anything else "zh"
 * @return the path relative to FILE_PATH (for example "zh/xxxx.mp3"), or "" when the
 *         text is empty, the service returns no audio, or any exception occurs
 */
public static String getSoundMp3(String text, String fileName, QuestionTypeEnum questionType)
{
    if (StringUtils.isEmpty(text))
    {
        return "";
    }
    String rtnfileName = "";
    String type = "zh";
    try
    {
        iniAPI();
        if (QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType()))
        {
            type = "en";
        }
        TtsResponse res = client.synthesis(text, type, 1, null);
        byte[] data = res.getData();
        if (data != null)
        {
            // The standard Base64 alphabet contains '/' and '+'. A '/' would be read
            // as a directory separator in the path below, so map both characters to
            // their URL-safe counterparts and drop the '=' padding.
            String encodedName = base64.encodeToString(fileName.getBytes())
                    .replace("=", "")
                    .replace('+', '-')
                    .replace('/', '_');
            rtnfileName = type + "/" + encodedName + ".mp3";
            String path = FILE_PATH + rtnfileName;
            File file = new File(path);
            if (!file.exists())
            {
                Util.writeBytesToFileSystem(data, path);
            }
        } else
        {
            // No audio came back: log the service's result object. The "{}" placeholder
            // is required, otherwise the logger silently drops the argument.
            JSONObject jsonObj = res.getResult();
            logger.error("invoke baidu synthesis API error: {}", jsonObj);
        }
    } catch (Exception e)
    {
        rtnfileName = "";
        logger.error("invoke baidu synthesis API error:", e);
    }
    return rtnfileName;
}
/**
 * Runs Baidu speech recognition on a 16 kHz PCM audio file.
 *
 * @param filePath     path of the audio file; an empty value returns "" immediately
 * @param questionType ENGLISH_WORD adds the option dev_pid=1737; any other type
 *                     sends no options
 * @return the text extracted by getResult, or "" when an exception occurs
 */
public static String recognizeSound(String filePath, QuestionTypeEnum questionType)
{
    if (StringUtils.isEmpty(filePath))
    {
        return "";
    }
    String text = "";
    try
    {
        iniAPI();
        HashMap<String, Object> asrOptions = null;
        boolean english = QuestionTypeEnum.ENGLISH_WORD.getType().equals(questionType.getType());
        if (english)
        {
            // NOTE(review): 1737 is presumably Baidu's English recognition model id;
            // confirm against the Baidu ASR documentation.
            asrOptions = new HashMap<>();
            asrOptions.put("dev_pid", 1737);
        }
        JSONObject response = client.asr(filePath, "pcm", 16000, asrOptions);
        text = getResult(response);
    } catch (Exception e)
    {
        logger.error("invoke baidu asr API error:", e);
    }
    return text;
}
/**
 * Extracts the recognized text from a Baidu ASR response.
 *
 * When "err_no" is 0, the entries of the "result" array are joined with ';'
 * (the first entry is always taken, later empty entries are skipped) and every
 * "," is removed. Otherwise the response is logged and "" is returned.
 *
 * @param asrRes response object returned by the ASR call; may be null
 * @return the joined text, or "" on a null or error response
 */
private static String getResult(JSONObject asrRes)
{
    if (asrRes == null)
    {
        logger.error("invoke baidu asr API error: empty response");
        return "";
    }
    if (asrRes.getInt("err_no") != 0)
    {
        // The "{}" placeholder is required; without it the logger drops the argument
        // and the actual error payload never reaches the log.
        logger.error("invoke baidu asr API error: {}", asrRes);
        return "";
    }
    JSONArray arrayResult = asrRes.getJSONArray("result");
    StringBuilder sbResult = new StringBuilder();
    for (int i = 0; i < arrayResult.length(); i++)
    {
        String candidate = arrayResult.get(i).toString();
        if (i == 0)
        {
            sbResult.append(candidate);
        } else if (!StringUtils.isEmpty(candidate))
        {
            sbResult.append(';').append(candidate);
        }
    }
    // Literal replacement; no regular expression is needed for a single character.
    return sbResult.toString().replace(",", "");
}
科大讯飞的语音识别及合成
public class IatAPI
{
/** Logger for this class. */
private static final Logger logger = LoggerFactory.getLogger(IatAPI.class);
/*
 * Settings for the iFlytek speech-recognition (IAT) web API.
 * Reference implementation:
 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java
 */
// Application id (placeholder, replace with your own).
static final String APPID = "你的APPID";
// API key used when computing the request checksum (placeholder).
static final String APPKEY_IAT = "你的秘钥";
// Endpoint of the IAT service.
static final String URL_IAT = "http://api.xfyun.cn/v1/service/v1/iat";
// Value sent in the X-Real-Ip header (placeholder).
static final String IP = "服务器IP地址";
/**
 * Sends an audio file to the iFlytek IAT (speech-to-text) endpoint.
 *
 * The file is read fully, Base64-encoded and posted as the form field "audio"
 * with headers built for aue "raw" and engine type "sms16k".
 *
 * @param filePath path of the audio file to upload
 * @return the response returned by HttpUtil.doPost, unparsed
 * @throws Exception if reading the file, encoding it, or the HTTP call fails
 */
public static String process(String filePath) throws Exception
{
    Map<String, String> header = getHeader("raw", "sms16k");
    // Read the audio file into a byte array, then Base64-encode it.
    byte[] audioByteArray = FileUtil.read2ByteArray(filePath);
    String audioBase64 = new String(Base64.encodeBase64(audioByteArray), "UTF-8");
    // Base64 output can contain '+', '/' and '='. The request is declared as
    // application/x-www-form-urlencoded, where an unescaped '+' is decoded as a
    // space, so the value must be percent-encoded.
    // NOTE(review): assumes HttpUtil.doPost sends the body unchanged; confirm it
    // does not encode the body again.
    String bodyParam = "audio=" + java.net.URLEncoder.encode(audioBase64, "UTF-8");
    return HttpUtil.doPost(URL_IAT, header, bodyParam);
}
/**
 * Builds the HTTP request headers for an iFlytek IAT call.
 *
 * @param aue        value written to the "aue" field of the X-Param JSON
 * @param engineType value written to the "engine_type" field of the X-Param JSON
 * @return map holding Content-Type, X-Param, X-CurTime, X-CheckSum, X-Appid and X-Real-Ip
 * @throws UnsupportedEncodingException if the UTF-8 charset is not available
 */
private static Map<String, String> getHeader(String aue, String engineType) throws UnsupportedEncodingException
{
    // Current time in whole seconds.
    String curTime = String.valueOf(System.currentTimeMillis() / 1000L);

    // Business parameters as a small JSON object, then Base64-encoded.
    StringBuilder json = new StringBuilder();
    json.append("{\"aue\":\"").append(aue).append("\"");
    json.append(",\"engine_type\":\"").append(engineType).append("\"}");
    String paramBase64 = new String(Base64.encodeBase64(json.toString().getBytes("UTF-8")));

    // Token: MD5 hex digest of API key + timestamp + encoded parameters.
    String checkSum = DigestUtils.md5Hex(APPKEY_IAT + curTime + paramBase64);

    Map<String, String> header = new HashMap<>();
    header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8");
    header.put("X-Param", paramBase64);
    header.put("X-CurTime", curTime);
    header.put("X-CheckSum", checkSum);
    header.put("X-Appid", APPID);
    header.put("X-Real-Ip", IP);
    return header;
}
public class TtsAPI
{
/** Logger for this class. */
private static final Logger logger = LoggerFactory.getLogger(TtsAPI.class);
/*
 * Settings for the iFlytek web API. Reference implementation:
 * https://github.com/IflytekAIUI/DemoCode/blob/master/webapi/java/Iat.java
 * NOTE(review): the link points at the recognition demo (Iat.java) although this
 * class calls the TTS endpoint; possibly copied from IatAPI, confirm.
 */
// Application id (placeholder, replace with your own).
static final String APPID = "你的APP id";
// API key used when computing the request checksum (placeholder).
static final String APPKEY_TTS = "你的秘钥";
// Endpoint of the TTS service.
static final String URL_TTS = "http://api.xfyun.cn/v1/service/v1/tts";
// Value sent in the X-Real-Ip header (placeholder).
static final String IP = "服务器地址";
// Directory that synthesized audio is saved to, read from the "download.folder" config key.
static final String FILE_PATH = Config.getString("download.folder");
/**
 * Sends text to the iFlytek TTS endpoint and saves the returned audio as an MP3 file.
 *
 * On success (response Content-Type "audio/mpeg") the audio bytes are saved under
 * FILE_PATH as "&lt;sid&gt;.mp3". Failures are logged rather than propagated.
 *
 * @param text text to synthesize; null is rejected without calling the service
 * @return the saved file name ("&lt;sid&gt;.mp3"), or null when synthesis fails
 * @throws Exception declared for compatibility; exceptions raised while calling the
 *                   service are caught and logged inside this method
 */
public static String process(String text) throws Exception
{
    if (text == null)
    {
        logger.error("there is error: text to synthesize is null");
        return null;
    }
    String result = null;
    long startTime = System.currentTimeMillis();
    try
    {
        Map<String, String> header = getHeader("audio/L16;rate=16000", "lame", "xiaoyan", "50", "50", "", "text",
                "50");
        // The body is declared as application/x-www-form-urlencoded, so the text must
        // be percent-encoded; otherwise '&', '+', '%' and non-ASCII characters are
        // misinterpreted by the receiver.
        // NOTE(review): assumes HttpUtil.doMultiPost sends the body unchanged; confirm
        // it does not encode the body again.
        String body = "text=" + java.net.URLEncoder.encode(text, "UTF-8");
        Map<String, Object> resultMap = HttpUtil.doMultiPost(URL_TTS, header, body);
        if ("audio/mpeg".equals(resultMap.get("Content-Type")))
        {
            // Synthesis succeeded: store the audio under the session id.
            String mp3Name = resultMap.get("sid") + ".mp3";
            FileUtil.save(FILE_PATH, mp3Name, (byte[]) resultMap.get("body"));
            result = mp3Name;
        } else
        {
            // Synthesis failed: the body carries the error description.
            logger.error(resultMap.get("body").toString());
        }
    } catch (Exception e)
    {
        logger.error("there is error:", e);
    }
    long endTime = System.currentTimeMillis();
    logger.info("finish get voice:" + (endTime - startTime));
    return result;
}
/**
 * Builds the HTTP request headers for an iFlytek TTS call.
 *
 * The X-Param JSON always contains "auf"; every other field is added only when
 * its argument is neither null nor empty.
 *
 * @param auf        value of the "auf" field
 * @param aue        value of the "aue" field
 * @param voiceName  value of the "voice_name" field
 * @param speed      value of the "speed" field
 * @param volume     value of the "volume" field
 * @param engineType value of the "engine_type" field
 * @param textType   value of the "text_type" field
 * @param pitch      value of the "pitch" field
 * @return map holding Content-Type, X-Param, X-CurTime, X-CheckSum, X-Real-Ip and X-Appid
 * @throws UnsupportedEncodingException if the UTF-8 charset is not available
 */
private static Map<String, String> getHeader(String auf, String aue, String voiceName, String speed, String volume,
        String engineType, String textType, String pitch) throws UnsupportedEncodingException
{
    String curTime = String.valueOf(System.currentTimeMillis() / 1000L);

    // Optional fields in the order they are written to the JSON object.
    String[][] optionalFields = {
        { "aue", aue },
        { "voice_name", voiceName },
        { "speed", speed },
        { "volume", volume },
        { "pitch", pitch },
        { "engine_type", engineType },
        { "text_type", textType },
    };

    StringBuilder param = new StringBuilder();
    param.append("{\"auf\":\"").append(auf).append("\"");
    for (String[] field : optionalFields)
    {
        if (StringUtil.isNullOrEmpty(field[1]))
        {
            continue;
        }
        param.append(",\"").append(field[0]).append("\":\"").append(field[1]).append("\"");
    }
    param.append("}");

    String paramBase64 = new String(Base64.encodeBase64(param.toString().getBytes("UTF-8")));
    // Token: MD5 hex digest of API key + timestamp + encoded parameters.
    String checkSum = DigestUtils.md5Hex(APPKEY_TTS + curTime + paramBase64);

    Map<String, String> header = new HashMap<>();
    header.put("Content-Type", "application/x-www-form-urlencoded; charset=utf-8");
    header.put("X-Param", paramBase64);
    header.put("X-CurTime", curTime);
    header.put("X-CheckSum", checkSum);
    header.put("X-Real-Ip", IP);
    header.put("X-Appid", APPID);
    return header;
}
原文:https://www.cnblogs.com/liguoyi/p/9231607.html