-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathiclr_company_count.py
107 lines (83 loc) · 3.67 KB
/
iclr_company_count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from bs4 import BeautifulSoup
import urllib.request
from lxml import html, etree
import requests, operator, sys, pdb
class iclr_company_count_puller(object):
    """Count accepted ICLR 2017 papers per company e-mail domain.

    A paper is credited to a company when at least one of its author ids
    (e-mail addresses listed on the paper's OpenReview forum page) has a
    domain equal to one of ``self.companies``.  Each company is credited at
    most once per paper.  Counts accumulate on the instance across calls.
    """

    # Seconds to wait on each forum-page request; without a timeout a stalled
    # server would hang the whole run.
    REQUEST_TIMEOUT = 30

    # track name -> (accepted-titles file, saved OpenReview listing page).
    # Titles were gathered from
    #   http://www.iclr.cc/doku.php?id=iclr2017:conference_posters
    #   http://www.iclr.cc/doku.php?id=iclr2017:workshop_posters
    # The listing pages are local mirrors of
    #   https://openreview.net/group?id=ICLR.cc/2017/conference
    #   https://openreview.net/group?id=ICLR.cc/2017/workshop
    # downloaded as .htm files to avoid having to deal with the Ajax query.
    _TRACK_FILES = {
        'conference': ('iclr2017_mirror/iclr2017_conference_track_accepted.txt',
                       'iclr2017_mirror/ICLR2017_conference_OpenReview.htm'),
        'workshop': ('iclr2017_mirror/iclr2017_workshop_track_accepted.txt',
                     'iclr2017_mirror/ICLR2017_workshop_OpenReview.htm'),
    }

    def __init__(self):
        """Set up the tracked e-mail domains with every count at zero."""
        self.companies = ['nvidia.com', 'google.com', 'fb.com', 'microsoft.com',
                          'intel.com', 'openai.com', 'ibm.com', 'baidu.com',
                          'adobe.com', 'samsung.com', 'nec-labs.com',
                          'salesforce.com']
        self.company_counts = dict.fromkeys(self.companies, 0)

    def count_companies(self, conflicts):
        """Credit the first tracked company that is found in ``conflicts``.

        Companies are tried in ``self.companies`` order and the scan stops at
        the first hit, so at most one count is incremented per call.

        Args:
            conflicts: any object supporting ``in`` against a domain string.
        """
        for company in self.companies:
            if company in conflicts:
                self.company_counts[company] += 1
                break

    @staticmethod
    def _is_forum_link(href):
        """Return True when the 4th '/'-separated part of ``href`` starts with 'forum'."""
        parts = href.split('/')
        # Short hrefs ('#', relative paths) have no 4th part; they are not
        # forum links rather than an error.
        return len(parts) > 3 and parts[3].startswith('forum')

    @staticmethod
    def _author_domains(raw_authorids):
        """Return the set of e-mail domains in a comma-separated author-id string."""
        domains = set()
        for address in raw_authorids.split(','):
            pieces = address.split('@')
            # Entries without an '@' carry no domain and are skipped.
            if len(pieces) > 1:
                domains.add(pieces[1].strip())
        return domains

    @staticmethod
    def _read_titles(path):
        """Return the whitespace-stripped lines of the text file at ``path``."""
        with open(path) as f:
            return [line.strip() for line in f]

    # helper function for ICLR 2017 papers
    def _iclr_2017(self, mirror_url, accepted_titles):
        """Update ``self.company_counts`` from one saved OpenReview listing.

        Every forum link found in the listing is fetched over HTTP; pages
        whose title is in ``accepted_titles`` have their author-id domains
        matched against ``self.companies``.

        Args:
            mirror_url: path of the saved listing page; it is opened as a
                local UTF-8 text file.
            accepted_titles: iterable of accepted paper titles (stripped).
        """
        accepted = set(accepted_titles)  # O(1) membership per fetched page

        with open(mirror_url, 'r', encoding='utf-8') as mirror:
            # Name the parser explicitly: lxml is what Beautiful Soup picks by
            # default when it is installed (this file imports it), and naming
            # it avoids the "no parser was explicitly specified" warning.
            soup = BeautifulSoup(mirror.read(), 'lxml')

        links = [anchor['href'] for anchor in soup.find_all('a', href=True)
                 if self._is_forum_link(anchor['href'])]

        for url in links:
            page = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            tree = html.fromstring(page.content)
            title = tree.xpath('//h2[contains(@class, "note_content_title citation_title")]/text()')
            # A page without a title node cannot be matched; skip it.
            if not title or title[0].strip() not in accepted:
                continue

            content_field = tree.xpath('//strong[contains(@class, "note-content-field")]/text()')
            content_text = tree.xpath('//span[contains(@class, "note-content-value")]/text()')
            # Field labels and values are paired by position.
            try:
                raw_authorids = content_text[content_field.index('Authorids:')]
            except (ValueError, IndexError):
                # No author-id field on this page (or labels/values misaligned).
                continue

            domains = self._author_domains(raw_authorids)
            for company in self.companies:
                if company in domains:
                    self.company_counts[company] += 1

    def iclr(self, track):
        """Add the counts for one ICLR 2017 track and return the running totals.

        Args:
            track: 'conference' or 'workshop'.

        Returns:
            ``self.company_counts`` (the same dict object on every call, so
            totals from earlier calls are included).

        Raises:
            SystemExit: if ``track`` is not a known track name.
        """
        try:
            accepted_path, mirror_path = self._TRACK_FILES[track]
        except (KeyError, TypeError):
            sys.exit('cannot reconize the track name!')

        self._iclr_2017(mirror_path, self._read_titles(accepted_path))
        return self.company_counts
if __name__ == "__main__":
    puller = iclr_company_count_puller()

    # The puller keeps its counts between calls, so the second report shows
    # conference and workshop totals combined.
    reports = (
        ('conference', 'conference track only:'),
        ('workshop', 'conference + workshop tracks'),
    )
    for position, (track, heading) in enumerate(reports):
        if position:
            # Blank line between consecutive reports.
            print()
        print(heading)
        tallies = puller.iclr(track)
        # Highest count first.
        ranked = sorted(tallies.items(), key=operator.itemgetter(1), reverse=True)
        for domain, total in ranked:
            print(domain, ':', total)