# Tags:
#!/usr/bin/env python
import os
import sys
import os.path
def find_import(line):
    """Extract the fully qualified names imported by one source line.

    Handles plain imports (``import a.b.C`` with an optional trailing ``;``)
    and single-line brace selectors (``import a.b.{C, D}``). Inside a
    selector, a rename ``C => D`` is reported under its source name ``C``
    and a hidden entry ``C => _`` is dropped.

    Args:
        line: One line of source text.

    Returns:
        None when the line is not an import statement, otherwise a list of
        imported names (one element for a plain import).
    """
    IMPORT_CMD = "import "
    line = line.strip()
    if not line.startswith(IMPORT_CMD):
        return None
    # Strip again after removing ';' so "import a.B ;" leaves no trailing
    # whitespace in the reported name.
    line = line[len(IMPORT_CMD):].strip().strip(';').strip()
    parts = line.split("{")
    if len(parts) == 1:
        return parts
    head = parts[0]
    names = []
    for selector in parts[1].strip("}").split(","):
        source, arrow, alias = selector.partition("=>")
        source = source.strip()
        if not source:
            # Empty selector, e.g. from a trailing comma.
            continue
        if arrow and alias.strip() == "_":
            # "Name => _" hides Name instead of importing it.
            continue
        names.append("%s%s" % (head, source))
    return names
def import2path(roots, import_name):
    """Locate the source file that corresponds to a dotted import name.

    The dots in ``import_name`` are turned into ``/`` and, for each root in
    order, ``<root>/<path>.java`` is tried before ``<root>/<path>.scala``.

    Args:
        roots: Iterable of source root directories to search, in priority
            order.
        import_name: Dotted name such as ``a.b.C``.

    Returns:
        The path of the first existing file, or None when no root contains
        a matching ``.java`` or ``.scala`` file.
    """
    spath = import_name.replace('.', '/')
    for root in roots:
        # Within one root a .java file wins over a .scala file.
        for ext in ("java", "scala"):
            fpath = os.path.join(root, "%s.%s" % (spath, ext))
            if os.path.isfile(fpath):
                return fpath
    return None
def file_info(fpath):
    """Count the non-import lines of a source file and list its imports.

    Blank lines are ignored. Every remaining line is passed to
    ``find_import``; lines it recognises as imports are excluded from the
    line count and their names are collected.

    Args:
        fpath: Path of the source file to read.

    Returns:
        A tuple ``(line_count, imports)`` where ``line_count`` is the number
        of non-blank, non-import lines and ``imports`` is the list of
        imported names in file order.
    """
    # The context manager closes the file even if reading fails.
    with open(fpath, "r") as f:
        lines = [line.strip() for line in f if line.strip() != ""]
    imports = []
    import_lines = 0
    for line in lines:
        import_array = find_import(line)
        if import_array is not None:
            # Count the line, not the names: one grouped import line can
            # yield several names but is still a single line.
            import_lines += 1
            imports.extend(import_array)
    return (len(lines) - import_lines, imports)
def collect_file_info(collected, roots, entry_name):
    """Record line counts for an entry and everything it transitively imports.

    ``collected`` is updated in place. A name that resolves to a file is
    stored as ``collected[file_path] = line_count``; a name that cannot be
    resolved is stored as ``collected[name] = None``. Names and files that
    are already present are not processed again.

    Args:
        collected: Dict accumulating the results; mutated by this call.
        roots: Source root directories handed to ``import2path``.
        entry_name: Dotted import name to start from.
    """
    # Explicit stack of iterators instead of recursion: the visiting order
    # is the same depth-first pre-order, but a long import chain cannot hit
    # the interpreter's recursion limit.
    pending = [iter([entry_name])]
    while pending:
        name = next(pending[-1], None)
        if name is None:
            # Current import list exhausted; resume the parent's list.
            pending.pop()
            continue
        if name in collected:
            continue
        fpath = import2path(roots, name)
        if fpath is None:
            collected[name] = None
            continue
        if fpath in collected:
            continue
        line_count, imports = file_info(fpath)
        collected[fpath] = line_count
        pending.append(iter(imports))
def collect_ref_info(roots, entry_names):
    """Build the reference map for a set of entry names.

    Args:
        roots: Source root directories to search.
        entry_names: Iterable of dotted import names to start from.

    Returns:
        A dict filled by ``collect_file_info``: resolved file paths map to
        their line counts, unresolved import names map to None.
    """
    collect_info = {}
    for entry_name in entry_names:
        collect_file_info(collect_info, roots, entry_name)
    return collect_info
def show_files_with_lines(files, title):
    """Print a titled ``name:lines`` listing followed by totals.

    Args:
        files: Iterable of ``(name, line_count)`` pairs.
        title: Text shown in the header line.
    """
    print("=============== %s =================" % title)
    lines_total = 0
    files_total = 0
    for name, line_count in files:
        lines_total += line_count
        files_total += 1
        print("%s:%s" % (name, line_count))
    print("=============== total lines = %d,total files = %d =================" % (lines_total, files_total))
def show_files(files, title):
    """Print a titled listing with one entry per line.

    Args:
        files: Iterable of entries to print.
        title: Text shown in the header line.
    """
    print("=============== %s =================" % title)
    for entry in files:
        print(entry)
def _read_nonblank_lines(path):
    """Return the stripped, non-blank lines of the text file at ``path``."""
    with open(path) as f:
        return [line.strip() for line in f if line.strip() != ""]


# Usage: <script> ROOTS_FILE ENTRY_NAMES_FILE
#   ROOTS_FILE        one source root directory per line
#   ENTRY_NAMES_FILE  one dotted import name per line
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.stderr.write("usage: %s ROOTS_FILE ENTRY_NAMES_FILE\n" % sys.argv[0])
        sys.exit(2)
    roots = _read_nonblank_lines(sys.argv[1])
    entry_names = _read_nonblank_lines(sys.argv[2])
    ref_info = collect_ref_info(roots, entry_names)

    # Resolved files carry a line count; unresolved import names carry None.
    in_files = [item for item in ref_info.items() if item[1] is not None]
    out_files = [name for name, count in ref_info.items() if count is None]

    spark_prefix = "org.apache.spark."
    hadoop_prefix = "org.apache.hadoop."
    spark_not_found = sorted(f for f in out_files if f.startswith(spark_prefix))
    hadoop_files = sorted(f for f in out_files if f.startswith(hadoop_prefix))
    # Everything unresolved that is neither a Spark nor a Hadoop name.
    other_files = sorted(
        f for f in out_files
        if not f.startswith(spark_prefix) and not f.startswith(hadoop_prefix)
    )

    show_files_with_lines(in_files, "spark source")
    show_files(spark_not_found, "spark import name not file name")
    show_files(hadoop_files, "hadoop ref")
    show_files(other_files, "others ref")
# Tags:
# Original article: http://www.cnblogs.com/haochen2016/p/5529675.html