# %% Download NIPS 2020 paper information
import json
import os
import re

import pandas as pd
import requests
import tqdm
from bs4 import BeautifulSoup


os.chdir(os.path.dirname(os.path.abspath(__file__)))

# %%
PAPER_HASH_PATTERN = re.compile(r'poster_(?P<UID>\w+)\.html')
SESSION_PATTERN = re.compile(r'Orals & Spotlights Track \d+:\s*(?P<session>[^;]*)')


def cleanup_string(s):
    """Strip leading/trailing whitespace and collapse runs of spaces."""
    s = s.strip()
    while '  ' in s:
        s = s.replace('  ', ' ')
    return s


def download_file(download_url, file_name=None):
    """Stream a file to disk, with a progress bar when the size is known."""
    if file_name is None:
        file_name = os.path.basename(download_url)
    response = requests.get(download_url, stream=True)
    total = response.headers.get('Content-Length')  # may be absent
    pbar = None
    if total is not None:
        pbar = tqdm.tqdm(desc=f'Downloading from {download_url} to {file_name}',
                         total=int(total), unit='B', unit_scale=True, unit_divisor=1000)
    with open(file_name, 'wb') as file:
        for chunk in response.iter_content(chunk_size=10240):
            if chunk:
                file.write(chunk)
                if pbar is not None:
                    pbar.update(len(chunk))
    if pbar is not None:
        pbar.close()


# %%
# Download the paper list.
if not os.path.exists('papers.json'):
    download_file('https://neurips.cc/virtual/2020/public/papers.json', file_name='papers.json')

# %%
# Get the oral paper list: collect the UIDs linked from the orals page.
oral_papers = set()
response = requests.get('https://neurips.cc/virtual/2020/public/f_orals.html')
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup.find_all('a', href=PAPER_HASH_PATTERN):
    href = tag['href']
    UID = PAPER_HASH_PATTERN.search(href).group('UID')
    oral_papers.add(UID)

# %%
# Process the paper list.
with open('papers.json', mode='r') as file:
    data = json.load(file)

df = pd.DataFrame(columns=['ID', 'Category', 'Title', 'Authors', 'Keywords', 'Sessions',
                           'URL', 'Proceedings URL', 'PDF URL', 'UID'])
for paper in tqdm.tqdm(data):
    if paper['eventtype'] != 'Poster':
        continue

    # Default to Poster; a named "Orals & Spotlights" session implies at
    # least Spotlight, and membership in the orals page implies Oral.
    UID = paper['UID']
    category = 'Poster'
    sessions = '; '.join(paper['sessions'])
    sessions = '; '.join([match.group('session') for match in SESSION_PATTERN.finditer(sessions)])
    sessions = cleanup_string(sessions)
    if sessions != '':
        category = 'Spotlight'
    if UID in oral_papers:
        category = 'Oral'

    keywords = set()
    for keyword in ('; '.join(paper['keywords'])).split('; '):
        keyword = cleanup_string(keyword)
        if keyword != '':
            keywords.add(keyword)
    keywords = '\n'.join(sorted(keywords))

    df.loc[len(df)] = {
        'ID': paper['id'],
        'Category': category,
        'Title': cleanup_string(paper['title']),
        'Authors': cleanup_string(', '.join(paper['authors'])),
        'Keywords': keywords,
        'Sessions': sessions,
        'URL': f'https://neurips.cc/virtual/2020/public/poster_{UID}.html',
        'Proceedings URL': paper['paper_pdf_url'],
        'PDF URL': f'https://proceedings.neurips.cc/paper/2020/file/{UID}-Paper.pdf',
        'UID': UID,
    }

# Sort Oral before Spotlight before Poster via an explicit categorical order.
df['Category'] = pd.Categorical(df['Category'], categories=['Oral', 'Spotlight', 'Poster'])
df.sort_values(by=['Category', 'Sessions', 'Keywords'], inplace=True)
df.to_csv('paper_list.csv', index=False)

# %%
# Get paper details: add one indicator column per subject area.
all_subject_areas = set()
for paper in tqdm.tqdm(df.itertuples(), total=len(df)):
    if paper.Keywords == '':
        continue
    all_subject_areas.update(paper.Keywords.split('\n'))

all_subject_areas.discard('')

df = df.reindex(columns=df.columns.to_list() + sorted(all_subject_areas))
for index, keywords in df['Keywords'].items():
    for area in keywords.split('\n'):
        if area != '':
            df.loc[index, area] = 'Y'

df.to_csv('NeurIPS Papers.csv', index=False)
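For a quick look at the output, something like the following works on the `paper_list.csv` written above. This is a minimal sketch: the column names ('Category', 'Keywords', 'Title') come from the script, while the subject-area string is only an illustrative example, not verified against the actual data.

import pandas as pd

df = pd.read_csv('paper_list.csv')

# Count papers per category ('Oral' / 'Spotlight' / 'Poster').
print(df['Category'].value_counts())

# 'Keywords' holds newline-separated subject areas in a single cell,
# so a plain substring match is enough for a quick filter.
area = 'deep learning'  # example subject area (hypothetical)
hits = df[df['Keywords'].fillna('').str.contains(area, case=False, regex=False)]
print(hits[['Category', 'Title']].to_string(index=False))

The wide 'NeurIPS Papers.csv', with one 'Y' column per subject area, is better suited to spreadsheet-style filtering.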
Original post: https://www.cnblogs.com/imoon22/p/14255581.html