# Tags: pandas data pwd nta lin app title pre contain
# In[1]
import pandas as pd
import numpy as np
import json
import os
import re
# In[2]
# !pwd  (notebook shell command, left disabled)
# Switch the working directory to the project folder; the relative ./data/...
# paths used by the following cells are resolved against it.
os.chdir('./root/FAQ/')
# In[2]
# Load the question corpus: one text file whose entries are separated by the
# literal token '[SEP]'.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm against the data.
with open('./data/all_data.txt', 'r', encoding='utf-8') as f:
    data = f.read().split('[SEP]')
# One row per entry, in a single 'question' column.
AQ = pd.DataFrame(data, columns=['question'])
# In[3]
# Answerable questions are treated as the positive examples.
# Trim surrounding whitespace left over from splitting the raw file.
AQ['question'] = AQ['question'].str.strip()
# Every row of this frame gets the class label 2.
AQ['label'] = 2
# In[4]
# Negative samples: sports questions outside the target topic, plus non-sports
# questions.
# Read the raw bytes and silently drop byte sequences that are not valid UTF-8
# instead of failing on them.
with open('./data/Negative.json', 'rb') as f:
    line = f.read().decode('utf8', 'ignore')
# Turn the records into one JSON array so pandas can parse the file in one go.
# Split on newlines only: splitting on every whitespace character would
# replace spaces inside record values with commas.
# NOTE(review): assumes one JSON record per line -- confirm against the data.
records = [rec.strip() for rec in line.split('\n') if rec.strip()]
# Write as UTF-8 explicitly; pd.read_json decodes files as UTF-8 by default.
with open('./data/Negative.txt', 'w', encoding='utf-8') as f:
    f.write('[' + ','.join(records) + ']')
# In[5]
NoAQ = pd.read_json('./data/Negative.txt')
# Notebook display of the question titles.
NoAQ['title']
# In[4]
# Second file of negative samples: sports questions outside the target topic,
# plus non-sports questions.
# Read the raw bytes and silently drop byte sequences that are not valid UTF-8
# instead of failing on them.
with open('./data/Negative02.json', 'rb') as f:
    line2 = f.read().decode('utf8', 'ignore')
# Turn the records into one JSON array so pandas can parse the file in one go.
# Split on newlines only: splitting on every whitespace character would
# replace spaces inside record values with commas.
# NOTE(review): assumes one JSON record per line -- confirm against the data.
records2 = [rec.strip() for rec in line2.split('\n') if rec.strip()]
# Write as UTF-8 explicitly; pd.read_json decodes files as UTF-8 by default.
with open('./data/Negative02.txt', 'w', encoding='utf-8') as f:
    f.write('[' + ','.join(records2) + ']')
# In[5]
NoAQ02 = pd.read_json('./data/Negative02.txt')
# Notebook display of the question titles.
NoAQ02['title']
# In[5]
# Stack both negative-sample frames into one. DataFrame.append was removed in
# pandas 2.0; pd.concat is the equivalent call. As with append, the original
# row labels are kept, so index labels repeat across the two halves.
NoAQ = pd.concat([NoAQ, NoAQ02])
# In[6]
# 285155  (presumably the cell output recorded in the notebook)
# Size of the combined negative-sample frame.
train_len = len(NoAQ)
# Count of answerable rows after capping the frame at train_len rows.
print('东奥(可回答): ', len(AQ.iloc[:train_len]))
# 285155
print('体育-非东奥 + 非体育类:', train_len)
# In[7]
# Reduce the negative samples to the question text only, then rename the column.
NoAQ = NoAQ.drop(labels=['answer', 'desc', 'url'], axis=1)
# Positional rename: this assignment only succeeds when exactly one column
# remains after the drop above.
NoAQ.columns = ['question']
# Notebook display of the resulting frame.
NoAQ
# In[8]
# Keyword filter for sports-related questions among the negative samples.
sports = ['跑步', '打球', '打', '强', '壮', '体育', '运动员', '运动', '活动', '训练', '得分', '比赛', '参赛', '赢', '球']
# Build one alternation pattern; escaping keeps every keyword a literal match
# even if a keyword containing a regex metacharacter is added later.
sports_pattern = '|'.join(map(re.escape, sports))
# na=False maps missing questions to False so the result is a pure boolean mask
# that can be used directly for row selection.
found = NoAQ['question'].str.contains(sports_pattern, na=False)
# sports_idx = NoAQ['question'][found].index
# print(len(sports_idx))
# NoAQ.iloc[sports_idx]
# In[9]
# Tags: pandas data pwd nta lin app title pre contain
# Original article: https://www.cnblogs.com/douzujun/p/14375660.html