Requirement:
/** * before: * file A1.csv {1,2,3,4,5} * file A2.csv {2,3,9,10,11} * file B1.csv {5,12,13,14,15} * file B2.csv {16,14,15,4,9,20,30} * A1.csv, A2.csv, B1.csv and B2.csv must not share any repeated values * * after: * file A1.csv {1,4} * file A2.csv {2,3,10,11} * file B1.csv {12,13} * file B2.csv {16,9,20,30} */
tangxin@tangxin:~/csvrepeat$ ls A1.csv A2.csv B1.csv B2.csv
CSVUtilVersion2.java
import lombok.extern.slf4j.Slf4j; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.io.LineIterator; import org.springframework.util.StringUtils; import java.io.*; import java.lang.reflect.Array; import java.util.*; /** * before: * file A1.csv {1,2,3,4,5} * file A2.csv {2,3,9,10,11} * file B1.csv {5,12,13,14,15} * file B2.csv {16,14,15,4,9,20,30} * A1.csv A2.csv A3.csv A4.csv cant not repeat * * after: * file A1.csv {1,4} * file A2.csv {2,3,10,11} * file B1.csv {12,13} * file B2.csv {16,9,20,30} */ @Slf4j public class CSVUtilVersion2 { private static final String CSV_PATH = "/home/tangxin/csvrepeat/"; private static final boolean CREATE_SWITCH = true; /** * read single column data list * @param path * @return */ public static List<String> ids(String path) { List<String> result = new ArrayList<>(); File csv = new File(path); // CSV文件路径 LineIterator it = null; try { it = FileUtils.lineIterator(csv); while (it.hasNext()) { String line = it.nextLine(); if (line.trim().contains("ID")) { continue; } String[] arr = line.split(","); String ID = arr[0]; ID = ID.replaceAll("\"", "").trim(); if (!StringUtils.isEmpty(ID)) { result.add(ID); } } } catch (Exception e) { log.error("读取ID csv文件失败:{}", e.getMessage()); } finally { LineIterator.closeQuietly(it); } return result; } /** * from src delete oth * @param src * @param oth * @return */ public static List removeAll(List src, List oth) { LinkedList result = new LinkedList(src); HashSet othHash = new HashSet(oth); Iterator iter = result.iterator(); while (iter.hasNext()) { if (othHash.contains(iter.next())) { iter.remove(); } } return result; } /** * -Xms1g -Xmx1g -XX:PermSize=128m -XX:SurvivorRatio=2 -XX:+UseParallelGC * @param args * @throws Exception */ public static void main(String[] args) throws Exception { //∑=1+2+3+...+(n-1) group LinkedList<String> fileList = new LinkedList<>(); fileList.add("A1.csv"); fileList.add("A2.csv"); fileList.add("B1.csv"); 
fileList.add("B2.csv"); // fileList.add("C1.csv"); DescartesRepeat(fileList); ded(fileList); } private static void DescartesRepeat(LinkedList<String> fileList) { Set<String> repeatList = new HashSet<>(); Set<String> groupSet = new HashSet<>(); Set<String> goONList = new HashSet<>(); //A1->A2,B1,B2 for (int i = 0; i < fileList.size(); i++) { String itemI = fileList.get(i); for (int j = 0; j < fileList.size(); j++) { String itemJ = fileList.get(j); if (!itemI.equals(itemJ)) { String groupR1 = itemI + "->" + itemJ; String groupR2 = itemJ + "->" + itemI; if (groupSet.contains(groupR1) || groupSet.contains(groupR2)){ continue; } groupSet.add(groupR1); String repeatT = repeat(CSV_PATH + itemI, CSV_PATH + itemJ); if(!StringUtils.isEmpty(repeatT)){ repeatList.add(repeatT); //System.out.println(groupR1+"->"+repeatT); } } } } if (CollectionUtils.isNotEmpty(repeatList)) { // System.out.println(repeatList); for (String repeatItem : repeatList) { Iterator<String> iterator = fileList.iterator(); while (iterator.hasNext()) { String oldItem = iterator.next(); String oldS = oldItem.replace(".csv", "").replace("-new",""); String repeatS = repeatItem.replace(".csv","").replace("-new",""); if (repeatS.contains(oldS)) { iterator.remove(); goONList.add(repeatItem); } } } fileList.addAll(goONList); System.out.println(fileList); DescartesRepeat(fileList); } } public static void ded(List<String> args) { //保证指定csv列表每组都不能有重复数据 for (int i = 0; i < args.size(); i++) { // if(i>0){ // continue; // } String source = CSV_PATH + args.get(i); for (int j = 0; j < args.size(); j++) { if (i == j) { continue; } String target = CSV_PATH + args.get(j); intersection(source, target); } } } public static void intersection(String sourcePath, String targetPath) { List<String> ids1 = ids(sourcePath); List<String> ids2 = ids(targetPath); List<String> inter = (List<String>) CollectionUtils.intersection(ids1, ids2); System.out.println(sourcePath + "和" + targetPath + "的重复数据大小" + inter.size()); } public static 
String repeat(String source, String target){ //cdd fund xyd List<String> ids1 = ids(source); List<String> ids2 = ids(target); // System.out.println(source + "集合大小" + ids1.size()); // System.out.println(target + "集合大小" + ids2.size()); List<String> inter = (List<String>) CollectionUtils.intersection(ids1, ids2); // System.out.println("去重数据大小:" + inter.size()); if (inter != null && inter.size() > 0) { if (ids1.size() > ids2.size()) { return repeatInner(source, ids1, inter); } else if (ids2.size() > ids1.size()) { return repeatInner(target, ids2, inter); } else { return repeatInner(source, ids1, inter); } } return ""; } private static String repeatInner(String source, List<String> ids, List<String> inter) { String newPath = source.replace(".csv", "-new.csv"); List<String> ids1new = removeAll(ids, inter); createCSV(ids1new, newPath); return newPath.replace(CSV_PATH,""); } /** * 创建CSV文件 */ public static void createCSV(List<String> list, String fileName) { if(!CREATE_SWITCH){ // System.out.println("创建csv开关关闭"); return; }else{ // System.out.println("创建csv开关开启"); } // 表格头 Object[] head = {"ID"}; List<Object> headList = Arrays.asList(head); //数据 List<List<Object>> dataList = new ArrayList<>(); List<Object> rowList = null; for (int i = 0; i < list.size(); i++) { rowList = new ArrayList<>(); rowList.add(list.get(i)); dataList.add(rowList); } File csvFile; BufferedWriter csvWtriter = null; try { csvFile = new File(fileName); File parent = csvFile.getParentFile(); if (parent != null && !parent.exists()) { parent.mkdirs(); } csvFile.createNewFile(); // GB2312使正确读取分隔符"," csvWtriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csvFile), "GB2312"), 1024); // 写入文件头部 writeRow(headList, csvWtriter); // 写入文件内容 for (List<Object> row : dataList) { writeRow(row, csvWtriter); } csvWtriter.flush(); } catch (Exception e) { e.printStackTrace(); } finally { try { csvWtriter.close(); } catch (IOException e) { e.printStackTrace(); } } } /** * 写一行数据 * * @param row 数据列表 * @param 
csvWriter * @throws IOException */ private static void writeRow(List<Object> row, BufferedWriter csvWriter) throws IOException { for (Object data : row) { StringBuffer sb = new StringBuffer(); String rowStr = sb.append("\"").append(data).append("\",").toString(); csvWriter.write(rowStr); } csvWriter.newLine(); } }
tangxin@tangxin:~/csvrepeat$ ls A1.csv A1-new.csv A1-new-new.csv A2.csv A2-new.csv B1.csv B2.csv B2-new.csv B2-new-new.csv