最近有个小需求:处理讯飞语音录入内容,解析其中的价格和商品名称。代码写得比较乱,做个小笔记。
package me.ele.napos.goods.api.processor.transformers.tools; import me.ele.arch.etrace.common.util.StringUtils; import me.ele.napos.goods.api.descriptor.exceptions.InvalidParameterException; import me.ele.napos.goods.api.processor.transformers.MenuDataTransformer; import me.ele.napos.vine.Vine; import me.ele.napos.vine.base.logger.Logger; import me.ele.napos.vine.descriptor.payload.exception.ServiceException; import java.lang.reflect.Array; import java.math.BigDecimal; import java.math.RoundingMode; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class VoicePriceRecognition { private final static String NOT_HAS_PRICE_CONTENT="no price"; //private static final Logger vineLogger = Vine.getLogger(VoicePriceRecognition.class); //private static final Character[] PRICE_UNIT = {‘元‘,‘块‘,‘角‘,‘毛‘,‘分‘}; private static final Character[] CN_NUMERIC = { ‘一‘, ‘二‘, ‘三‘, ‘四‘, ‘五‘, ‘六‘, ‘七‘, ‘八‘, ‘九‘, ‘壹‘, ‘贰‘, ‘叁‘, ‘肆‘, ‘伍‘, ‘陆‘, ‘柒‘, ‘捌‘, ‘玖‘, ‘十‘, ‘百‘, ‘千‘, ‘拾‘, ‘佰‘, ‘仟‘, ‘万‘, ‘亿‘, // ‘○‘, ‘O‘, ‘零‘ }; private static final Character[] EN_NUMERIC = {‘0‘,‘1‘,‘2‘,‘3‘,‘4‘,‘5‘,‘6‘,‘7‘,‘8‘,‘9‘}; private static final String BASE_UNIT_YUAN = "元"; private static final String BASE_UNIT_KUAI = "块"; private static final String BASE_UNIT_JIAO = "角"; private static final String BASE_UNIT_MAO = "毛"; private static final String BASE_UNIT_FEN = "分"; private static Map<Character, Integer> cnNumeric = null; private static Map<Character, Integer> enNumeric = null; static { cnNumeric = new HashMap<Character, Integer>(40, 0.85f); for (int j = 0; j < 9; j++) { cnNumeric.put(CN_NUMERIC[j] , j + 1); } for (int j = 9; j < 18; j++) { cnNumeric.put(CN_NUMERIC[j] , j - 8); } cnNumeric.put(‘两‘, 2); cnNumeric.put(‘零‘, 0); cnNumeric.put(‘十‘, 10); cnNumeric.put(‘拾‘, 10); cnNumeric.put(‘百‘, 100); cnNumeric.put(‘佰‘, 100); cnNumeric.put(‘千‘, 1000); cnNumeric.put(‘仟‘, 1000); cnNumeric.put(‘万‘, 10000); cnNumeric.put(‘亿‘, 100000000); for(int i=0;i<EN_NUMERIC.length;i++){ cnNumeric.put(EN_NUMERIC[i],i); } } private static void commonRecognition(String voiceContent,String baseStr, Map<String,String> result) throws ServiceException { String itemName; StringBuffer priceContent = new StringBuffer(); int baseIndex = voiceContent.lastIndexOf(baseStr); if(baseIndex == 0){ throw new InvalidParameterException("NOT_HAS_NAME", "no price"); } //处理块 或 元 之前是 数字或者点 如 1234.5块(元),从块开始 倒着 一位一位的判断是否属于数字或者汉字数字直到非的那一位停止 int baseIndex_ = baseIndex; if(Character.isDigit(voiceContent.charAt(baseIndex-1))){ while (baseIndex >0 && (Character.isDigit(voiceContent.charAt(--baseIndex))||‘.‘ ==voiceContent.charAt(baseIndex))){ priceContent.insert(0,voiceContent.charAt(baseIndex)); } result.put("type","number"); } //处理块 或 元 之前是 是汉字 如 一千二百元点五块(元) else { if(cnNumeric.containsKey(voiceContent.charAt(baseIndex-1))){ while (baseIndex >0 && (cnNumeric.containsKey(voiceContent.charAt(--baseIndex))||‘点‘ == voiceContent.charAt(baseIndex))){ priceContent.insert(0,voiceContent.charAt(baseIndex)); } } result.put("type","cn"); } //如果是汉字和数字的混合体 如 一百元56毛柒捌分 if(baseIndex==0 && cnNumeric.containsKey(voiceContent.charAt(baseIndex))){ throw new InvalidParameterException("NOT_HAS_NAME", "no price"); } itemName = voiceContent.substring(0,baseIndex+1); result.put("name",itemName); priceContent.append(voiceContent.substring(baseIndex_,voiceContent.length())); if(priceContent.length() <2){ throw new InvalidParameterException("NOT_HAS_PRICE",NOT_HAS_PRICE_CONTENT); } result.put("price",priceContent.toString()); getPrice(result); System.out.println(result); } private static void getPrice( Map<String,String> result) throws ServiceException { //按照规则 将部分字符替换 String price = result.get("price"); /* StringBuffer sb = new StringBuffer(price); while(price.indexOf(‘零‘) >-1){ sb.deleteCharAt(price.indexOf(‘零‘)); price = sb.toString(); }*/ price = price .replace("块","元") .replace("毛","角") .replace(‘佰‘, ‘百‘) .replace(‘仟‘, ‘千‘) .replace(‘拾‘, ‘十‘) .replace(‘零‘, ‘ ‘) .replace("两","二").trim(); result.put("price",price); //检验是否具有价格语义 checkemanticAndSetPrice(result); } /** * 只针对千元进行校验,如果以后扩展到万 需要对万之前的特殊校验,beforeWan ,beforeYi * @param beforeYuan * @throws InvalidParameterException */ private static void checkBeforeYuan(String beforeYuan) throws InvalidParameterException { if(beforeYuan == null || beforeYuan.length()<1 || "".equals(beforeYuan)){ return; } boolean invalid = beforeYuan.split("百").length >2 || beforeYuan.split("千").length >2 || beforeYuan.split("十").length >2 || beforeYuan.split("零").length >2; if(invalid){ throw new InvalidParameterException("价格无法识别:"+beforeYuan); } } private static void checkemanticAndSetPrice( Map<String,String> result) throws ServiceException { String targePrice = result.get("price"); //检验单位顺序 和单位之间的值 //1.单位个数是否都为1 如 元 角 分 是否只有一次而且顺序从小到大。按单位分出区间,各个数字区间是否是大小顺序排列。 char[] targetPriceChars = targePrice.toCharArray(); int yuanSum = 0; int jiaoSum = 0; int fenSum = 0; for(char s:targetPriceChars){ if(‘元‘ == s){ yuanSum ++; } if(‘角‘ == s){ jiaoSum ++; } if(‘分‘ == s){ fenSum ++; } } //String [] yuanSplit = targePrice.split("元"); //String [] jiaoSplit = targePrice.split("角"); //String [] fenSplit = targePrice.split("分"); int yuanIndex = targePrice.lastIndexOf(‘元‘); int jiaoIndex = targePrice.lastIndexOf(‘角‘); int fenIndex = targePrice.lastIndexOf(‘分‘); int invalidLength = 2; if(yuanSum>=invalidLength || jiaoSum>=invalidLength || fenSum>=invalidLength){ throw new InvalidParameterException("价格无法识别:"+targePrice); } boolean shunXu = (yuanIndex > jiaoIndex &&jiaoIndex>-1) || (jiaoIndex >fenIndex && fenIndex > -1) || (yuanIndex >fenIndex && fenIndex>-1); if(shunXu){ throw new InvalidParameterException("价格无法识别:"+targePrice); } //用于判断是否是最后一位 boolean last = false; int lastIndex = 0; String beforeYuan=null; String betweenYuanAndJiao=null; String betweenJiaoAndFen = null; String afterCnDian = null; boolean has_dian=false; StringBuffer newPrice =new StringBuffer(); if(yuanIndex > -1){ beforeYuan= targePrice.substring(0,yuanIndex); if(beforeYuan.contains("点")){ int dianIndex = beforeYuan.lastIndexOf(‘点‘); if(dianIndex > -1){ afterCnDian = beforeYuan.substring(dianIndex+1); beforeYuan = beforeYuan.substring(0,dianIndex); } } newPrice.append(beforeYuan).append("元"); lastIndex = yuanIndex; } //beforeyuan需要校验 计量单位 千 百 十 零 是否有多个 checkBeforeYuan(beforeYuan); if(jiaoIndex > -1){ betweenYuanAndJiao = targePrice.substring(yuanIndex+1,jiaoIndex).trim(); lastIndex = jiaoIndex; } if(fenIndex > -1){ betweenJiaoAndFen = targePrice.substring(jiaoIndex >-1 ?jiaoIndex+1:yuanIndex+1,fenIndex).trim(); lastIndex = fenIndex; } Double lastedPrice = 0D; //如果不相等 说明最后还存在没有货币单位的数字存在 if(lastIndex+1 != targePrice.length()){ String lastStr = targePrice.substring(lastIndex+1,targePrice.length()); if(lastStr.length() >1){ lastStr = lastStr.substring(0,1); } //如果是数字 ,如果是可识别为数字的的汉字 if(isCNNumeric(lastStr.charAt(0))>-1){ if(‘元‘ == targePrice.charAt(lastIndex)){ lastedPrice = 0.1*Double.valueOf(isCNNumeric(lastStr.charAt(0))); } if(‘角‘ == targePrice.charAt(lastIndex)){ lastedPrice = 0.01*Double.valueOf(isCNNumeric(lastStr.charAt(0))); } if(‘分‘ == targePrice.charAt(lastIndex)){ lastedPrice =0D; } } last = true; } //重新拼写价格 if(betweenYuanAndJiao !=null && betweenYuanAndJiao.length()>1){ betweenYuanAndJiao = betweenYuanAndJiao.substring(0,1); newPrice.append(betweenYuanAndJiao).append("角"); } if(betweenJiaoAndFen != null && betweenJiaoAndFen.length()>1){ betweenJiaoAndFen = betweenJiaoAndFen.substring(0,1); newPrice.append(betweenJiaoAndFen).append("分"); } if(last){ newPrice.append(targePrice.substring(lastIndex,targePrice.length())); } result.put("price",newPrice.toString()); Double beforeYuanPrice = 0D; if(yuanIndex > -1){ if("cn".equals(result.get("type"))){ beforeYuanPrice = Double.valueOf(cnNumericToArabic(beforeYuan)); }else if("number".equals(result.get("type"))){ try { beforeYuanPrice = Double.valueOf(beforeYuan); }catch (NumberFormatException e){ throw new InvalidParameterException("价格解析错误,无效价格"); } }else if("mixed".equals(result.get("type"))){ } } BigDecimal bigDecimal = new BigDecimal(beforeYuanPrice.toString()); if(betweenYuanAndJiao != null && !"".equals(betweenYuanAndJiao) && isCNNumeric(betweenYuanAndJiao.charAt(0)) >-1){ bigDecimal = bigDecimal.add(BigDecimal.valueOf(isCNNumeric(betweenYuanAndJiao.charAt(0))).multiply(BigDecimal.valueOf(0.1D))); } if(betweenJiaoAndFen != null && !"".equals(betweenYuanAndJiao) &&isCNNumeric(betweenJiaoAndFen.charAt(0)) >-1){ bigDecimal = bigDecimal.add(BigDecimal.valueOf(isCNNumeric(betweenJiaoAndFen.charAt(0))).multiply(BigDecimal.valueOf(0.01D))); } bigDecimal = bigDecimal.add(new BigDecimal(String.valueOf(lastedPrice))); //如果‘点‘ 存在,处理点后面的。//解析两位 角 分 BigDecimal afterCnDianPrice = null; if(afterCnDian != null){ if(afterCnDian.length()>1){ char jiao = afterCnDian.charAt(0); char fen = afterCnDian.charAt(1); afterCnDianPrice= BigDecimal.valueOf(0.1).multiply(BigDecimal.valueOf(isCNNumeric(jiao))).add(BigDecimal.valueOf(0.01).multiply(BigDecimal.valueOf(isCNNumeric(fen)))); } if(afterCnDian.length()==1){ char jiao = afterCnDian.charAt(0); afterCnDianPrice = BigDecimal.valueOf(0.1).multiply(BigDecimal.valueOf(isCNNumeric(jiao))); } } if(afterCnDianPrice != null){ bigDecimal = bigDecimal.add(afterCnDianPrice); } bigDecimal = bigDecimal.setScale(2, BigDecimal.ROUND_HALF_DOWN); result.put("price",String.valueOf(bigDecimal)); } private static int cnNumericToArabic(String cnn) { cnn = cnn.trim(); if (cnn.length() == 1) { return isCNNumeric(cnn.charAt(0)); } int yi = -1, wan = -1, qian = -1, bai = -1, shi = -1; int val = 0; yi = cnn.lastIndexOf(‘亿‘); if (yi > -1) { val += cnNumericToArabic(cnn.substring(0, yi)) * 100000000; if (yi < cnn.length() - 1) { cnn = cnn.substring(yi + 1 , cnn.length()); } else { cnn = ""; } if (cnn.length() == 1) { int arbic = isCNNumeric(cnn.charAt(0)); if (arbic <= 10) { val += arbic * 10000000; } cnn = ""; } } wan = cnn.lastIndexOf(‘万‘); if (wan > -1) { val += cnNumericToArabic(cnn.substring(0, wan)) * 10000; if (wan < cnn.length() - 1) { cnn = cnn.substring(wan + 1 , cnn.length()); } else { cnn = ""; } if (cnn.length() == 1) { int arbic = isCNNumeric(cnn.charAt(0)); if (arbic <= 10) { val += arbic * 1000; } cnn = ""; } } qian = cnn.lastIndexOf(‘千‘); if (qian > -1) { val += cnNumericToArabic(cnn.substring(0, qian)) * 1000; if (qian < cnn.length() - 1) { cnn = cnn.substring(qian + 1 , cnn.length()); } else { cnn = ""; } if (cnn.length() == 1) { int arbic = isCNNumeric(cnn.charAt(0)); if (arbic <= 10) { val += arbic * 100; } cnn = ""; } } bai = cnn.lastIndexOf(‘百‘); if (bai > -1) { val += cnNumericToArabic(cnn.substring(0, bai)) * 100; if (bai < cnn.length() - 1) { cnn = cnn.substring(bai + 1 , cnn.length()); } else { cnn = ""; } if (cnn.length() == 1) { int arbic = isCNNumeric(cnn.charAt(0)); if (arbic <= 10) { val += arbic * 10; } cnn = ""; } } shi = cnn.lastIndexOf(‘十‘); if (shi > -1) { if (shi == 0) { val += 1 * 10; } else { val += cnNumericToArabic(cnn.substring(0 , shi)) * 10; } if (shi < cnn.length() - 1) { cnn = cnn.substring(shi + 1 , cnn.length()); } else { cnn = ""; } } cnn = cnn.trim(); //特殊处理 如 三三三元 处理成为3元,也可以理解为处理成为三百三十三元。 if(cnn.length()>1){ cnn = cnn.substring(0,1); } for (int j = 0; j < cnn.length(); j++) { val += isCNNumeric(cnn.charAt(j)) * Math.pow(10 , cnn.length() - j - 1); } return val; } private static int isCNNumeric(char c) { Integer i = cnNumeric.get(c); if (i == null) { return -1; } return i.intValue(); } private static int isENNumeric(char c) { Integer i = enNumeric.get(c); if (i == null) { return -1; } return i.intValue(); } public static Map<String,String> priceRecognition(String voiceContent) throws ServiceException { //1.非空 verify //vineLogger.info("获取语音输入内容:"+voiceContent); Map<String,String> result = new HashMap<>(); result.put("voiceContent",voiceContent); if(StringUtils.isEmpty(voiceContent)){ throw new InvalidParameterException("NOT_HAS_PRICE",NOT_HAS_PRICE_CONTENT); } //2.识别价格区间 //按照块 和 元 进行基准位置,如果没有块或者元,按照角,毛,分进行 if(voiceContent.contains(BASE_UNIT_YUAN)){ commonRecognition(voiceContent,BASE_UNIT_YUAN,result); } else if(voiceContent.contains(BASE_UNIT_KUAI)){ commonRecognition(voiceContent,BASE_UNIT_KUAI,result); } else if(voiceContent.contains(BASE_UNIT_MAO)){ commonRecognition(voiceContent,BASE_UNIT_MAO,result); }else if(voiceContent.contains(BASE_UNIT_JIAO)){ commonRecognition(voiceContent,BASE_UNIT_JIAO,result); }else if(voiceContent.contains(BASE_UNIT_FEN)){ commonRecognition(voiceContent,BASE_UNIT_FEN,result); }else { //不存在价格 commonRecognition(voiceContent+"元",BASE_UNIT_YUAN,result); //throw new InvalidParameterException("NOT_HAS_PRICE",NOT_HAS_PRICE_CONTENT); } return result; } public static void main (String[] args) throws ServiceException { String s0 = "毛血旺28";//"毛豆炸酱煲仔饭14"; String s1 = "醋0.5元"; String s2 = "西红柿2金一千二百块二分"; String s4 = "西红柿2金12.4元"; String s5 = "西红柿2金2222212.42222块"; String s6 = "西红柿2金一百元56毛柒捌分"; String s7 = "红烧肉一百一百五十五十元五十毛柒捌分"; String s8 = "手抓饼一二千三四百五六十七八元一二毛三四分"; String s9 = "一千二百三十四元五毛六分"; List<String> sb = new ArrayList<>(9); sb.add(s0); sb.add(s1); sb.add(s2); sb.add(s4); sb.add(s5); sb.add(s6); sb.add(s7); sb.add(s8); sb.add(s9); for(String s:sb){ try { priceRecognition(s); }catch (Exception e){ e.printStackTrace(); } } } }