// Tags:
package test; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.Vector; public class OpinionAnalyser { //倾向词表 public Vector <Word> words=new Vector <Word>(); //修饰词表 public Vector <Word> adjectives=new Vector <Word>(); //描述词表 public Vector <Word> descriptions=new Vector <Word>(); //正面句子数 public int posCount; //负面句子数 public int negCount; static String SERVER="59.77.233.*"; static String USER=""; static String PASSWORD=""; static String DATABASE="skycent"; //负面词的权重,为2表示负面词是正面词权重的两倍 static int NEG_WEIGHT=2; static int TITLE_WEIGHT=10; private static int atoi(String s) { return Integer.parseInt(s); } //读取数据库初始化三个词表和其他成员变量 public void OpinionAnalyser() throws SQLException { ConnDB conndb; PreparedStatement stmt = null; // PreparedStatement stmt = null; ResultSet rs = null; conndb = new ConnDB(SERVER, USER, PASSWORD, DATABASE); conndb.executeUpdate("SET NAMES ‘utf8mb4‘"); //获取倾向性词表 String strSQL = "select word,polar,weight from twordlist"; try { stmt = conndb.getConnection().prepareStatement(strSQL); rs = stmt.executeQuery(); } catch (SQLException e1) { e1.printStackTrace(); } // 处理空集情况 if (rs.next() == false) { System.out.println("twordlist没有词!"); } else{ rs.previous(); } while(rs.next()) { int polar=atoi(rs.getString("polar")); int weight=atoi(rs.getString("weight")); // System.out.println(polar+" "+weight+" "+rs.getString("word")); Word tmp=new Word(rs.getString("word"),polar,weight); words.addElement(tmp); // System.out.println(polar+" "+weight); } //获取描述词表 strSQL = "select word,type from twordlist_ms"; try { stmt = conndb.getConnection().prepareStatement(strSQL); rs = stmt.executeQuery(); } catch (SQLException e1) { e1.printStackTrace(); } // 处理空集情况 if (rs.next() == false) { System.out.println("twordlist_ms没有词!"); } else{ rs.previous(); } while(rs.next()) { int polar=atoi(rs.getString("type")); // System.out.println(polar+" 
"+rs.getString("word")); Word tmp=new Word(rs.getString("word"),polar,0); descriptions.addElement(tmp); } //获取修饰词表 strSQL = "select word,polar,weight from twordlist_xs"; try { stmt = conndb.getConnection().prepareStatement(strSQL); rs = stmt.executeQuery(); } catch (SQLException e1) { e1.printStackTrace(); } // 处理空集情况 if (rs.next() == false) { System.out.println("twordlist_xs没有词!"); } else{ rs.previous(); } while(rs.next()) { int polar=atoi(rs.getString("polar")); int weight=atoi(rs.getString("weight")); // System.out.println(polar+" "+weight+" "+rs.getString("word")); Word tmp=new Word(rs.getString("word"),polar,weight); adjectives.addElement(tmp); } posCount=0; negCount=0; conndb.close(); } //句子倾向性得分 public int sentenceScore(String sentence) { int opinionScore=0; //是否出现倾向词 int opinionPosition=0; for(int i=0;i<words.size();i++) { //找到倾向性词表 opinionPosition=sentence.indexOf(words.get(i).getWord()); // System.out.println(opinionPosition); if(opinionPosition!=-1) { //是否出现修饰词+倾向词 int flag=0; for(int j=0;j<adjectives.size();j++) { StringBuffer wordPair=new StringBuffer(); wordPair.append(adjectives.get(j).getWord()); wordPair.append(words.get(i).getWord()); int pairPosition =0; pairPosition=sentence.indexOf(wordPair.toString()); if(pairPosition!=-1) { // System.out.println("yeyeyeyey"); flag=1; int tmpScore=words.get(i).getWeight()*adjectives.get(j).getWeight()*words.get(i).getPolar()*adjectives.get(j).getPolar(); if(tmpScore>0) opinionScore +=tmpScore; else opinionScore +=tmpScore*NEG_WEIGHT; } } //没出现修饰词只计算倾向次本身的权重 if(flag==0) { // System.out.println(opinionPosition); // System.out.println("nnnnnnnnnnnnn"); if(words.get(i).getPolar()==1) { opinionScore+=words.get(i).getWeight()*words.get(i).getPolar(); // System.out.println(words.get(i).getWord()); // System.out.println("wwwwwwwww"); } else if(words.get(i).getPolar()==-1) { opinionScore+=words.get(i).getWeight()*words.get(i).getPolar()*NEG_WEIGHT; // System.out.println(words.get(i).getWord()); } } } } 
//System.out.println("最后得分:"+opinionScore); return opinionScore; } //计算一般新闻的倾向性 public void opinion(Set<String> keyword,String text,String title) { posCount=0; negCount=0; System.out.println("opinion"); //计算title的倾向性 shortTextOpinion(keyword,title); Set<String> sentences = new HashSet(); String[] array=text.split(" "); //System.err.println(array.length); for(int i=0;i<array.length;i++) { sentences.add(array[i]); } Iterator KwordIter=keyword.iterator(); Iterator senIter=sentences.iterator(); while(KwordIter.hasNext()) { String kwordIt=KwordIter.next().toString(); while(senIter.hasNext()) { String senIt=senIter.next().toString(); // String kwordIt=KwordIter.next().toString(); if((senIt.indexOf(kwordIt))!=-1) { //单个句子倾向性得分 int value=sentenceScore(senIt); if(value>0) posCount++; else if(value<0) negCount +=NEG_WEIGHT; } } } } //计算短文本如微博的倾向性 public void shortTextOpinion(Set<String> keyword,String text) { System.out.println("shortTextOpinion"); posCount=0; negCount=0; int kwordP=0; int owordP=0; Iterator kwordIter=keyword.iterator(); while(kwordIter.hasNext()) { String kwordIt=kwordIter.next().toString(); kwordP=text.indexOf(kwordIt); //文本中存在关键词 if(kwordP!=-1) { int opinionScore=0; int pairPosition=0; StringBuffer wordPair=new StringBuffer(); for(int i=0;i<words.size();i++) { owordP=text.indexOf(words.get(i).getWord()); if(owordP!=-1) { //是否出现词对 int flag=0; for(int j=0;j<adjectives.size();j++) { wordPair.append(adjectives.get(j).getWord()); wordPair.append(words.get(i).getWord()); pairPosition=text.indexOf(wordPair.toString()); if(pairPosition!=-1) { flag=1; int tmpScore=words.get(i).getWeight()*adjectives.get(j).getWeight()*words.get(i).getPolar()*adjectives.get(j).getPolar(); if(tmpScore>0) opinionScore +=tmpScore; else opinionScore +=NEG_WEIGHT*tmpScore; } } if(flag==0) { if(words.get(i).getPolar()==1) opinionScore +=words.get(i).getWeight()*words.get(i).getPolar(); else if(words.get(i).getPolar()==-1) opinionScore 
+=NEG_WEIGHT*words.get(i).getWeight()*words.get(i).getPolar(); } } } if(opinionScore>0) posCount +=TITLE_WEIGHT; else if(opinionScore<0) negCount +=TITLE_WEIGHT*NEG_WEIGHT; } } } //media=3为微博采用短文本倾向性,第二个参数为空 public void analyse(int media,Set<String> keyword,String text,String title) { if(media ==3) { System.out.println("media=3"); shortTextOpinion(keyword,title); } else { System.out.println("media=1"); opinion(keyword,text,title); } } //最终倾向性 public int getPolar() { if(posCount>negCount) return 1; else if(negCount>posCount) return -1; else return 0; } public static void main(String[] args) throws SQLException { OpinionAnalyser a=new OpinionAnalyser(); a.OpinionAnalyser(); a.sentenceScore("好不好!"); String str="心情很好"; System.out.println("文本倾向性:"+a.sentenceScore(str)); //String text="兴业证券正面临着暴跌!需要采取一定的措施来进行抵御!"; //Set <String> keyword = new HashSet(); //keyword.add("兴业证券"); //keyword.add("金融危机"); //String title="兴业证券面临金融危机"; //a.analyse(1, keyword, text, title); //System.out.println("该文本最后倾向性:"+a.getPolar()); } }
// Tags:
// Original article: http://www.cnblogs.com/zeze/p/5331650.html