# Tags: 统计 code 一个 进入 proc blank pytho 双语 close
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 28 18:42:33 2017
@author: lmt
"""
import re
import numpy as np
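'''
This program extracts word-alignment relations from a bilingual parallel
corpus that has been aligned with GIZA++.
It builds a mapping matrix from the source language to the target language;
numbering starts at 0 and the NULL token of the alignment file is treated as
the first word.
If two words are aligned, the cell matrixST[s][t] of the alignment matrix is
set to 1; every other cell is 0.
'''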
def alig_pairs(filepath, encoding=None):
    """Build one 0/1 word-alignment matrix per record of a GIZA++ alignment file.

    The file is consumed in records of three lines:

    1. a header line (its content is ignored; an empty read means end of file),
    2. the target sentence as whitespace-separated words,
    3. the source sentence, where every word is followed by ``({ ... })``
       holding the 1-based positions of the target words aligned to it, e.g.
       ``NULL ({ }) a ({ 1 }) b ({ 2 3 })``.

    Args:
        filepath: Path of the alignment file.
        encoding: Text encoding handed to ``open``.  ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one ``numpy`` array per record, shaped
        ``(target word count, source word count without NULL)``.  Cell
        ``[t][s]`` is 1 when target word ``t`` is aligned to source word ``s``
        (both 0-based) and 0 otherwise.  Alignments of the leading NULL token
        are not recorded.
    """
    # Splitting on this pattern yields [word, indices, word, indices, ..., tail]:
    # the capture group keeps each index list next to the word it belongs to.
    pattern = re.compile(r' \(\{([0-9 ]*)\}\) ?')
    matrices = []

    with open(filepath, 'r', encoding=encoding) as f:
        # Each successful header read starts a new three-line record.
        while f.readline():
            target = f.readline().strip().split()
            source = f.readline().strip()
            parts = pattern.split(source)

            # Half of the split pieces are words; subtract one to drop NULL.
            source_count = len(parts) // 2 - 1
            matrix = np.zeros((len(target), source_count))

            # parts[0] / parts[1] belong to NULL, so real words start at 2.
            for i in range(2, len(parts) - 2, 2):
                column = (i - 2) // 2
                # An empty or blank index list simply produces no positions.
                for position in parts[i + 1].split():
                    # Positions in the file are 1-based.
                    matrix[int(position) - 1][column] = 1

            # Matrices differ in size per sentence, so they are collected in a
            # list instead of being stacked.
            matrices.append(matrix)

    return matrices
#alig_pairs(‘test.txt‘)
#alig_pairs(‘117-06-28.183340.lmt.A3.final‘)
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import codecs
def _read_sentences(path):
    """Parse a UTF-8 corpus file into a list of sentences (lists of strings).

    A line with exactly four whitespace-separated fields is one word entry;
    its second and third fields are swapped and the four fields are re-joined
    with single spaces.  A blank line closes the current sentence.  Lines with
    any other field count are ignored.  Words after the last blank line are
    not returned, so a file must end with a blank line for its final sentence
    to be included.
    """
    sentences = []
    words = []
    with codecs.open(path, 'r', encoding='utf-8') as handle:
        for line in handle.readlines():
            fields = line.strip().split()
            if len(fields) == 4:
                fields[1], fields[2] = fields[2], fields[1]
                words.append(" ".join(fields))
            elif not fields:
                sentences.append(words)
                words = []
    return sentences


def get_matrix(chinese_path="result/result_cn", english_path='result/result_en'):
    """Build an empty labelled alignment grid for every sentence pair.

    Args:
        chinese_path: Corpus file whose sentences label the grid rows.
        english_path: Corpus file whose sentences label the grid columns.

    Returns:
        A tuple ``(matrix, chinese_sentence)``.  ``matrix`` holds one grid per
        sentence pair: a list of ``len(chinese) + 1`` rows, each with
        ``len(english) + 1`` string cells.  Row 0 carries the English word
        entries from column 1 on, column 0 carries the Chinese word entries
        from row 1 on, and every remaining cell (including ``[0][0]``) is
        ``"0"``.  ``chinese_sentence`` is the parsed Chinese corpus.  When the
        two files contain a different number of sentences, ``error`` is
        printed and ``matrix`` is empty.
    """
    chinese_sentence = _read_sentences(chinese_path)
    english_sentence = _read_sentences(english_path)

    matrix = []
    if len(english_sentence) != len(chinese_sentence):
        print('error')
        return matrix, chinese_sentence

    for chinese_words, english_words in zip(chinese_sentence, english_sentence):
        # Grids differ in size from sentence to sentence.
        grid = [["0"] * (len(english_words) + 1)
                for _ in range(len(chinese_words) + 1)]
        grid[0][1:] = english_words
        for row, word in enumerate(chinese_words, start=1):
            grid[row][0] = word
        matrix.append(grid)

    return matrix, chinese_sentence
#matrix,_ = get_matrix()
# ---------------------------------------------------------------------------
#-*-coding:utf-8-*-
import os
import string
def count(filepath):
    """Count total, blank and ``#``-comment lines of a text file and print them.

    A line is blank when it contains only whitespace.  A line is a comment
    when its first non-whitespace character is ``#``, so indented comments
    are counted as well.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        A tuple ``(total, countPound, countBlank)`` with the same numbers that
        are printed.
    """
    total = 0       # all lines
    countPound = 0  # comment lines
    countBlank = 0  # blank lines
    with open(filepath, 'r') as handle:
        for li in handle:
            total += 1
            stripped = li.strip()
            if not stripped:
                countBlank += 1
            elif stripped.startswith('#'):
                countPound += 1
    print(filepath)
    print("countBlank:%d" % countBlank)
    print("countPound:%d" % countPound)
    print("total:%d" % total)
    return total, countPound, countBlank
# Script entry: print the line statistics of the file `result_cn`.
count('result_cn')
# ---------------------------------------------------------------------------
#-*-coding:utf-8-*-
def bijiao(path1='lmt.txt', path2='lh.txt'):
    """Compare two text files line by line and report where they differ.

    Args:
        path1: First file to compare.
        path2: Second file to compare.

    Returns:
        The 1-based line numbers at which the files differ, in ascending
        order.  When one file is longer, each of its extra lines counts as a
        difference.  An empty list means the files are identical.
    """
    from itertools import zip_longest

    dif = []  # line numbers that differ
    with open(path1, 'r') as f1, open(path2, 'r') as f2:
        # zip_longest pads the shorter file with None, which never equals a
        # real line, so surplus lines in either file are reported.
        for line_no, (a, b) in enumerate(zip_longest(f1, f2), start=1):
            if a != b:
                dif.append(line_no)
    return dif
# Script entry: compare the two default files and print the differing lines.
c = bijiao()
if not c:
    # bijiao() returns a list, so "no differences" is an empty list, not 0.
    print('两个文件一样!')
else:
    print('有%d处不同' % len(c))
    for each in c:
        print('%d行不一样' % each)
# Tags: 统计 code 一个 进入 proc blank pytho 双语 close
# Original article: http://www.cnblogs.com/maowuyu-xb/p/7236769.html