package com.cmcm.goods_classification; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class WordProcess { private static final String DATASOURCEPATH = "D://mallet_data//0DataSource//Watches_Child//Sports_Watches"; private static final String STOPWORDSPATH = "C://mallet-2.0.7//stoplists//en.txt"; public static final String RESULTPATH = "D://automotives//result.txt"; private static Map<String, Integer> dataHash = new HashMap<String, Integer>(); private static Set<String> stopWordsSet = new HashSet<String>(); public static void main(String[] args) throws Exception { loadStopWords(); FileProcess.readFolder(DATASOURCEPATH); List<Map.Entry<String, Integer>> dataList = hashSort(); FileProcess.writeFile(dataList); } public static void pruneText(String textPath) { String text = FileProcess.readFile(textPath).toLowerCase();// 将所有字母化为小写 text = text.replaceAll("^[a-zA-Z0-9‘]|\\s+|\t|\r", " "); // 将非字母字符、多个空格回车换行均化为一个空格 String words[] = text.split("\\s+");// 取出单词,并将单词存入数组中 getFrequency(words); } public static void getFrequency(String[] words) { for (int i = 0; i < words.length; i++) { String key = words[i]; // key对应单词 if ((dataHash.get(key) != null) && (!stopWordsSet.contains(key))) { int value = ((Integer) dataHash.get(key)).intValue(); // value对应单词出现的频率,单词已在map中存在则value+1 value++; dataHash.put(key, new Integer(value)); } else { dataHash.put(key, new Integer(1)); // 单词未在map中存在则value初始化为1 } } } public static List<Map.Entry<String, Integer>> hashSort() { List<Map.Entry<String, Integer>> list_Data = new ArrayList<Map.Entry<String, Integer>>(dataHash.entrySet()); Collections.sort(list_Data, new Comparator<Map.Entry<String, Integer>>() { public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) { if (o2.getValue() != null && o1.getValue() != null && o2.getValue().compareTo(o1.getValue()) > 0) { 
return 1; } else { return -1; } } }); return list_Data; } public static void loadStopWords() { String stopWordsText = FileProcess.readFile(STOPWORDSPATH); // System.out.println(stopWordsText); String words[] = stopWordsText.split("\\s+|\\t|\\r|\\n");// 取出单词,并将单词存入数组中 System.out.println(words.length); for(String word : words){ stopWordsSet.add(word); } } }
// ---- FileProcess.java (separate source file) ----
package com.cmcm.goods_classification; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.io.PrintWriter; import java.util.LinkedList; import java.util.List; import java.util.Map; public class FileProcess { /** * read all file in folder * @param path */ public static void readFolder(String path) { int fileNum = 0, folderNum = 0; File file = new File(path); if (file.exists()) { LinkedList<File> list = new LinkedList<File>(); File[] files = file.listFiles(); for (File file2 : files) { if (file2.isDirectory()) {//System.out.println("DIR : " + file2.getAbsolutePath()); list.add(file2); folderNum++; } else { System.out.println("FILE: " + file2.getAbsolutePath()); WordProcess.pruneText(file2.getAbsolutePath()); fileNum++; } } File temp_file; while (!list.isEmpty()) { temp_file = list.removeFirst(); files = temp_file.listFiles(); for (File file2 : files) { if (file2.isDirectory()) {//System.out.println("DIR : " + file2.getAbsolutePath()); list.add(file2); folderNum++; } else { System.out.println("FILE: " + file2.getAbsolutePath()); fileNum++; WordProcess.pruneText(file2.getAbsolutePath()); } } } } else { System.out.println("File is not exist!"); } System.out.println(" num dir is: " + folderNum + "\n num file is: "+ fileNum); } /** * read content from filePath and return content * @param filePath */ public static String readFile(String filePath) { File file = new File(filePath); StringBuffer result = new StringBuffer(); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(file)); String tempString = null; while ((tempString = reader.readLine()) != null) { result.append(" "); result.append(tempString); } reader.close(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e1) { } } } return result.toString(); } /** * write content into filePath * @param dataList */ public static void 
writeFile(List<Map.Entry<String, Integer>> dataList) { System.out.println("start write word and frequency"); int size = dataList.size(); File file = null; FileWriter fileWrite = null; PrintWriter pw = null; int count = 1; try { // if file exist ,append ; if not, create file = new File(WordProcess.RESULTPATH); fileWrite = new FileWriter(file, true); pw = new PrintWriter(fileWrite); for (int i = 0; i < size; i++) { String word = dataList.get(i).getKey(); int frequency = dataList.get(i).getValue(); // System.out.println(word + " : " + frequency); pw.print(word); pw.print(" "); pw.print(count++); pw.print(" "); pw.print(frequency); pw.println(); } pw.flush(); fileWrite.flush(); } catch(IOException e) { e.printStackTrace(); }finally{ try { pw.close(); fileWrite.close(); } catch (IOException e) { e.printStackTrace(); } System.out.println("end write word and frequency"); } } }
// Original article (原文): http://java--hhf.iteye.com/blog/2174712