码迷,mamicode.com
首页 > 其他好文 > 详细

ReadFiles

时间:2014-09-19 22:29:26      阅读:325      评论:0      收藏:0      [点我收藏+]

标签:style   blog   color   io   os   java   ar   for   文件   

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that read a text corpus stored in two sub-directories of a
 * root directory, {@code ham} and {@code spam}, and turn it into a file list,
 * a vocabulary, and one {@code MyArray} row per file.
 *
 * <p>Every method that walks the corpus visits the {@code ham} entries first
 * and the {@code spam} entries second. Within each directory the entries are
 * sorted by path so that the rows returned by {@link #GetMatrix} line up with
 * the names returned by {@link #GetFileName} on every call
 * ({@code File.listFiles()} itself promises no particular order).
 *
 * <p>Files are decoded with the platform default charset and tokens are
 * lower-cased with the default locale, exactly as before.
 */
public class ReadFiles
{
    /**
     * Token pattern applied to every line of every corpus file.
     *
     * NOTE(review): the '.' in "\\d+.\\d+" is unescaped, so it matches any
     * character between two digit runs (e.g. "12-34" or "10:30"), not only a
     * decimal point. Left untouched because changing it would change the
     * vocabulary; confirm whether "\\." was intended.
     */
    private static final Pattern TOKEN_PATTERN=Pattern.compile("\\d+.\\d+|\\w+|\\$");

    /**
     * Returns the number of entries directly inside the given directory.
     * Sub-directories are counted as well, since {@code File.listFiles()}
     * returns both files and directories.
     *
     * @param pathName path of the directory to inspect
     * @return number of entries in the directory
     * @throws IllegalArgumentException if {@code pathName} is not a directory
     *         or cannot be listed (previously a bare NullPointerException)
     */
    public static int GetFileNum(String pathName)
    {
        File[] nextFiles=new File(pathName).listFiles();
        if(nextFiles==null)
        {
            throw new IllegalArgumentException("Not a readable directory: "+pathName);
        }
        return nextFiles.length;
    }

    /**
     * Returns the paths of all corpus entries: first those under
     * {@code ham}, then those under {@code spam}.
     *
     * @param pathName root directory containing the {@code ham} and
     *        {@code spam} sub-directories
     * @return list of entry paths, ham entries before spam entries
     * @throws IOException if either sub-directory cannot be listed
     */
    public static ArrayList<String> GetFileName(String pathName) throws IOException
    {
        ArrayList<String> fileName=new ArrayList<String>();
        for(File entry : corpusFiles(pathName))
        {
            fileName.add(entry.getPath());
        }
        return fileName;
    }

    /**
     * Returns the distinct lower-cased tokens found in all corpus files.
     * The tokens are collected in a {@code HashSet}, so the order of the
     * returned list is the set's iteration order.
     *
     * @param pathName root directory containing the {@code ham} and
     *        {@code spam} sub-directories
     * @return list of distinct tokens
     * @throws IOException if a sub-directory cannot be listed or a file
     *         cannot be read
     */
    public static ArrayList<String> GetWordsList(String pathName) throws IOException
    {
        HashSet<String> set=new HashSet<String>();
        for(File entry : corpusFiles(pathName))
        {
            set.addAll(readTokens(entry));
        }
        return new ArrayList<String>(set);
    }

    /**
     * Builds one {@code MyArray} per corpus file (ham files first, then spam
     * files). Each array is created with {@code wordList.size()} as its
     * constructor argument and initialised through {@code InitArray(0)};
     * then, for every token occurrence in the file that is present in
     * {@code wordList}, {@code SetPos} is called with the index of the
     * token's first occurrence in {@code wordList}.
     *
     * @param pathName root directory containing the {@code ham} and
     *        {@code spam} sub-directories
     * @param wordList vocabulary that defines the column positions
     * @return one row per corpus file, in the same order as
     *         {@link #GetFileName}
     * @throws IOException if a sub-directory cannot be listed or a file
     *         cannot be read
     */
    public static ArrayList<MyArray> GetMatrix(String pathName,ArrayList<String> wordList) throws IOException
    {
        ArrayList<MyArray> trainMatrix=new ArrayList<MyArray>();

        // Look-up table replacing the per-token wordList.indexOf() scan.
        // Only the first index of a word is stored, matching indexOf().
        Map<String,Integer> wordIndex=new HashMap<String,Integer>();
        for(int i=0;i<wordList.size();i++)
        {
            String word=wordList.get(i);
            if(!wordIndex.containsKey(word))
            {
                wordIndex.put(word,Integer.valueOf(i));
            }
        }

        for(File entry : corpusFiles(pathName))
        {
            // Read first so that a failing file does not leave a partial row.
            ArrayList<String> tokens=readTokens(entry);

            MyArray wordArray=new MyArray(wordList.size());
            wordArray.InitArray(0);
            for(String token : tokens)
            {
                Integer pos=wordIndex.get(token);
                if(pos!=null)
                {
                    wordArray.SetPos(pos.intValue());
                }
            }
            trainMatrix.add(wordArray);
        }

        return trainMatrix;
    }

    /** Lists the entries of {@code <pathName>/ham} followed by those of {@code <pathName>/spam}. */
    private static ArrayList<File> corpusFiles(String pathName) throws IOException
    {
        ArrayList<File> files=new ArrayList<File>();
        // File(parent, child) uses the platform separator instead of a hard-coded '\'.
        files.addAll(Arrays.asList(listSorted(new File(pathName,"ham"))));
        files.addAll(Arrays.asList(listSorted(new File(pathName,"spam"))));
        return files;
    }

    /** Lists a directory's entries sorted by path, failing with a clear message if it cannot be listed. */
    private static File[] listSorted(File dir) throws IOException
    {
        File[] entries=dir.listFiles();
        if(entries==null)
        {
            throw new FileNotFoundException("Not a readable directory: "+dir.getPath());
        }
        Arrays.sort(entries);
        return entries;
    }

    /** Reads a file line by line and returns its lower-cased tokens in order of appearance. */
    private static ArrayList<String> readTokens(File file) throws IOException
    {
        ArrayList<String> tokens=new ArrayList<String>();
        BufferedReader in=new BufferedReader(new InputStreamReader(new FileInputStream(file)));
        try
        {
            String line;
            while((line=in.readLine())!=null)
            {
                Matcher ma=TOKEN_PATTERN.matcher(line);
                while(ma.find())
                {
                    tokens.add(ma.group().toLowerCase());
                }
            }
        }
        finally
        {
            // Close even when readLine() throws, so the stream is never leaked.
            in.close();
        }
        return tokens;
    }
}

 

ReadFiles

标签:style   blog   color   io   os   java   ar   for   文件   

原文地址:http://www.cnblogs.com/aniy/p/3982320.html

(0)
(0)
   
举报
评论 一句话评论(0)
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!