大数据公益大学提供的一份数据,义务处理一下,原始数据是Excel,含有html标签,如下:
要求清洗掉html标签,和微博内容中的url地址。
主要分为两部分:
1.处理文本,清洗数据。
2.处理excel读写操作。
上代码:
ExcelUtil类,包含Excel2003-2007的读写操作,Excel使用Apache POI进行操作,需要jar包如下:
- package dat.datadeal;
-
- import java.io.File;
- import java.io.FileInputStream;
- import java.io.FileNotFoundException;
- import java.io.FileOutputStream;
- import java.io.IOException;
- import java.io.InputStream;
- import java.text.ParseException;
- import java.text.SimpleDateFormat;
- import java.util.ArrayList;
- import java.util.Date;
- import java.util.List;
- import java.util.Locale;
- import java.util.logging.Level;
- import java.util.logging.Logger;
-
- import org.apache.poi.hssf.usermodel.HSSFCell;
- import org.apache.poi.hssf.usermodel.HSSFCellStyle;
- import org.apache.poi.hssf.usermodel.HSSFRow;
- import org.apache.poi.hssf.usermodel.HSSFSheet;
- import org.apache.poi.hssf.usermodel.HSSFWorkbook;
- import org.apache.poi.ss.usermodel.Cell;
- import org.apache.poi.ss.usermodel.DateUtil;
- import org.apache.poi.ss.usermodel.Row;
- import org.apache.poi.ss.usermodel.Sheet;
- import org.apache.poi.ss.usermodel.Workbook;
- import org.apache.poi.xssf.usermodel.XSSFWorkbook;
-
- public class ExcelUtil{
-
-
- public List<String[]> readExcel(String filePath) {
- List<String[]> dataList = new ArrayList<String[]>();
- boolean isExcel2003 = true;
- if (isExcel2007(filePath)) {
- isExcel2003 = false;
- }
- File file = new File(filePath);
- InputStream is = null;
- try {
- is = new FileInputStream(file);
- } catch (FileNotFoundException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- Workbook wb = null;
- try {
- wb = isExcel2003 ? new HSSFWorkbook(is) : new XSSFWorkbook(is);
- } catch (IOException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- Sheet sheet = wb.getSheetAt(0);
- int totalRows = sheet.getPhysicalNumberOfRows();
- int totalCells = 0;
- if (totalRows >= 1 && sheet.getRow(0) != null) {
- totalCells = sheet.getRow(0).getPhysicalNumberOfCells();
- }
- for (int r = 0; r < totalRows; r++) {
- Row row = sheet.getRow(r);
- if (row == null) {
- continue;
- }
- String[] rowList = new String[totalCells];
- for (int c = 0; c < totalCells; c++) {
- Cell cell = row.getCell(c);
- String cellValue = "";
- if (cell == null) {
- rowList[c] = (cellValue);
- continue;
- }
- cellValue = ConvertCellStr(cell, cellValue);
- rowList[c] = (cellValue);
- }
- dataList.add(rowList);
- }
- return dataList;
- }
-
-
- private String ConvertCellStr(Cell cell, String cellStr) {
- switch (cell.getCellType()) {
- case Cell.CELL_TYPE_STRING:
-
- cellStr = cell.getStringCellValue().toString();
- break;
- case Cell.CELL_TYPE_BOOLEAN:
-
- cellStr = String.valueOf(cell.getBooleanCellValue());
- break;
- case Cell.CELL_TYPE_NUMERIC:
-
- if (DateUtil.isCellDateFormatted(cell)) {
-
- cellStr = formatTime(cell.getDateCellValue().toString());
- } else {
-
- cellStr = String.valueOf(cell.getNumericCellValue());
- }
- break;
- case Cell.CELL_TYPE_FORMULA:
-
- cellStr = cell.getCellFormula().toString();
- break;
- }
- return cellStr;
- }
-
-
-
- private boolean isExcel2007(String fileName) {
- return fileName.matches("^.+\\.(?i)(xlsx)$");
- }
-
- private String formatTime(String s) {
- SimpleDateFormat sf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy", Locale.ENGLISH);
- Date date = null;
- try {
- date = sf.parse(s);
- } catch (ParseException ex) {
- Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);
- }
- SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
- String result = sdf.format(date);
- return result;
- }
-
-
-
- public void writeExcel(String filePath,List<List<String>> dataList) throws IOException{
- HSSFWorkbook wb = new HSSFWorkbook();
- HSSFSheet sheet = wb.createSheet("sheet");
-
- HSSFCellStyle style = wb.createCellStyle();
- style.setAlignment(HSSFCellStyle.ALIGN_CENTER);
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- for (int i = 0; i < dataList.size(); i++) {
- HSSFRow row = sheet.createRow(i);
- List<String> list = dataList.get(i);
- for (int j = 0; j < list.size(); j++) {
- HSSFCell cell = row.createCell(j);
- cell.setCellValue(list.get(j));
- cell.setCellStyle(style);
- }
- }
-
- FileOutputStream fout = new FileOutputStream(filePath);
- wb.write(fout);
- fout.close();
- }
-
- }
DataClean类,包含对html标签,信息中url的的清洗。
- package dat.datadeal;
-
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
-
- public class DataClean {
-
-
- public static String delHtml(String inputString) {
- String htmlStr = inputString;
- String textStr = "";
- java.util.regex.Pattern p_script;
- java.util.regex.Matcher m_script;
- java.util.regex.Pattern p_html;
- java.util.regex.Matcher m_html;
- try {
- String regEx_html = "<[^>]+>";
- String regEx_script = "<[/s]*?script[^>]*?>[/s/S]*?<[/s]*?//[/s]*?script[/s]*?>"; // 定义script的正则表达式{或<script[^>]*?>[/s/S]*?<//script>
- p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
- m_script = p_script.matcher(htmlStr);
- htmlStr = m_script.replaceAll("");
- p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
- m_html = p_html.matcher(htmlStr);
- htmlStr = m_html.replaceAll("");
- textStr = htmlStr;
- } catch (Exception e) {
- System.err.println("Html2Text: " + e.getMessage());
- }
- return textStr;
- }
-
-
- public static String dealWithUrl(String str){
- String regEx = "[http|https]+[://]+[0-9A-Za-z:/[-]_#[?][=][.][&]]*";
- Pattern p = Pattern.compile(regEx);
- Matcher m = p.matcher(str);
- return m.replaceAll("");
- }
-
-
- public static void main(String[] args) throws IOException{
- ExcelUtil excelUtil = new ExcelUtil();
- List<List<String>> writeList = new ArrayList<List<String>>();
- List<String[]> readList =excelUtil.readExcel("/home/dat/javatest/微博数据_.xlsx");
- for(String[] lineArray:readList){
- List<String> strList = new ArrayList<String>();
- for(String str:lineArray){
- String strTmp = DataClean.dealWithUrl(DataClean.delHtml(str));
- strList.add(strTmp);
-
- }
- writeList.add(strList);
- }
-
- excelUtil.writeExcel("/home/dat/javatest/weibo.xlsx",writeList);
- System.out.println("job has finished...........");
- }
- }
清洗后数据: