# %% [markdown]
# # Mine model JSONs from HF and paginate. At the moment (May 2024) there are only 6 pages, so the traffic is negligible
# %%
import requests
import pandas as pd
from tqdm import tqdm
import panel as pn
import json
tqdm.pandas()
# Base URL for the API
base_url = "https://huggingface.co/models-json?other=feature-extraction&library=transformers.js&sort=trending&numItemsPerPage=50" # attention: https://huggingface.co/posts/do-me/362814004058611
# List to store all models
all_models = []
# Total number of pages to fetch
total_pages = 12  # adding more pages for the future; should be raised once there are more than 300 models
# Use tqdm to show progress
for page_number in tqdm(range(0, total_pages + 1), desc="Fetching Pages"):
    # Construct the full URL for the current page
    url = f"{base_url}&p={page_number}"
    # Send a GET request to the URL
    response = requests.get(url)
    # Check if the request was successful
    if response.status_code == 200:
        # Parse the JSON data from the response
        json_data = response.json()
        # Extract the models from the current page's data
        page_models = json_data['models']
        # Append the models to the list
        all_models.extend(page_models)
    else:
        print(f"Failed to fetch data from page {page_number}. Status code: {response.status_code}")

print(len(all_models), "models mined")
df = pd.DataFrame(all_models)
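# %% [markdown]
# Optional sanity check: the cells below assume the HF JSON still exposes `id`, `author`, `downloads` and `likes` per model; warn early if the API shape changed.
# %%
expected_columns = {"id", "author", "downloads", "likes"}
missing = expected_columns - set(df.columns)
if missing:
    print(f"Warning: expected columns missing from HF response: {missing}")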
# %% [markdown]
# # For each model, look at the ONNX file sizes. Unfortunately each model page must be requested once, as the sizes are not in the model's JSON
# Takes no more than 1.5 minutes
# %%
from bs4 import BeautifulSoup
import requests
def extract_size_from_url(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            # Find all 'a' tags with the specified title attribute
            a_tags = soup.find_all('a', title="Download file")
            model_sizes = []  # Initialize a list to store sizes of model files
            for a_tag in a_tags:
                file_name_tag = a_tag.find_previous_sibling('div').find('span')
                if not file_name_tag:  # Skip if there's no 'span' tag
                    continue
                file_name = file_name_tag.text.strip()
                if file_name.endswith(".onnx"):
                    if file_name.startswith("model"):
                        size = a_tag.text.strip().split("\n")[0]
                        model_sizes.append(size)
                    else:  # only if there is no normal "model" file
                        if file_name.startswith(("decoder", "encoder")):
                            size = a_tag.text.strip().split("\n")[0]
                            model_sizes.append(size)
            if model_sizes:
                return model_sizes
            else:
                return ""
        else:
            return ""  # f"HTTP Status Code: {response.status_code}"
    except requests.exceptions.RequestException as e:
        return f"Request error: {e}"
# extract_size_from_url("https://huggingface.co/Xenova/instructor-large/tree/main/onnx") # test
def scrape_sizes(model):
    sizes = extract_size_from_url(f"https://huggingface.co/{model}/tree/main/onnx")
    sizes = [i.replace(" ", "") for i in sizes]
    sizes = " | ".join(sizes)
    return sizes
# scrape_sizes("mixedbread-ai/mxbai-embed-large-v1")
# %%
df["sizes"] = df["id"].progress_apply(scrape_sizes) # 1 min 19s
# %% [markdown]
# ## Remove models that are not currently working (but have the transformers.js and feature-extraction tag)
# %%
print("Removing following models from dataset: \n",df[df["sizes"] == ""].id)
df = df[df["sizes"] != ""]
# %% [markdown]
# # Add min and max ONNX file size for sorting. Sizes must be converted from their different units first
# %%
import re
# Conversion dictionary
size_conversion = {'Byt': 1, 'Bytes': 1, 'kB': 1024, 'MB': 1024**2, 'GB': 1024**3}  # 'Byt' is needed because the regex below matches it before 'Bytes'
# Conversion function
def size_to_bytes(size_str):
    # Use regex to find the number and unit
    match = re.search(r'(\d+(\.\d+)?)\s*(Byt|Bytes|kB|MB|GB)', size_str)
    if not match:
        raise ValueError(f"Invalid size format: {size_str}")
    size_value = float(match.group(1))
    size_unit = match.group(3)
    return size_value * size_conversion[size_unit]

# Parsing and conversion function
def parse_and_find_min_max(sizes_str):
    sizes_list = sizes_str.split(' | ')
    sizes_bytes = [size_to_bytes(s) for s in sizes_list]
    return min(sizes_bytes), max(sizes_bytes)
# Apply the function and create new columns
# Assuming 'df' is a pandas DataFrame and 'sizes' is a column in that DataFrame
df['min_size'], df['max_size'] = zip(*df['sizes'].apply(parse_and_find_min_max))
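# %% [markdown]
# Quick sanity check for the conversion helpers, using illustrative size strings (not mined data)
# %%
assert size_to_bytes("512 kB") == 512 * 1024
assert size_to_bytes("1.5 GB") == 1.5 * 1024**3
assert parse_and_find_min_max("23MB | 90MB") == (23 * 1024**2, 90 * 1024**2)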
# %%
#df.sort_values("min_size", ascending=True).head(20) # sort as you please here
# %% [markdown]
# # Removing all models whose smallest ONNX file is below 50 kB for the moment
# %%
df = df[df["min_size"] > 50000].reset_index(drop=True)
df = df.reset_index(drop=True)
df["trending"] = df.index +1 # adding the trending column
# %%
from datetime import datetime
# Get today's date
today = datetime.today().strftime("%d-%m-%Y")
df["mined_date"] = today # append to df so that one could easily concat dfs of different dates and do a groupby or similar, convenience
# %%
df.head(10)
# %% [markdown]
# # Save files
# %%
df.to_excel(f"data/feature-extraction/transformersjs_{today}.xlsx")
df.to_parquet(f"data/feature-extraction/transformersjs_{today}.parquet")
df.to_csv(f"data/feature-extraction/transformersjs_{today}.csv")
df.to_json(f"data/feature-extraction/transformersjs_{today}.json")
# %% [markdown]
# # To HTML options (ready to be pasted into SemanticFinder)
# %%
# Assuming df is your DataFrame
html_options = []
for index, row in df.iterrows():
    # Extracting relevant information from each row
    author = row['author']
    downloads = row['downloads']
    likes = row['likes']
    sizes = row['sizes']
    id = row['id']
    # Creating the option string
    option_str = f'<option value="{id}">{id} | 💾{sizes} 📥{downloads} ❤️{likes}</option>'
    # Adding the option to the list
    html_options.append(option_str)

# Joining all options into a single string
html_options_str = '\n'.join(html_options)

with open(f"data/feature-extraction/transformersjs_html_options_{today}.html", 'w') as file:
    file.write(html_options_str)
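# %% [markdown]
# Each generated line has this shape (hypothetical values; the real ones come from `df`):
# `<option value="user/model">user/model | 💾90MB | 23MB 📥1234 ❤️56</option>`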
# %% [markdown]
# # To HTML table with filters/sorting
# %%
# Define the editors for your columns
tabulator_editors = {
    'float': {'type': 'number', 'max': 10, 'step': 0.1},
    'bool': {'type': 'tickCross', 'tristate': True, 'indeterminateValue': None},
    'str': {'type': 'list', 'valuesLookup': True},
}

# Create the Tabulator widget with header filters
header_filter_table = pn.widgets.Tabulator(
    df, sizing_mode='stretch_width', layout='fit_columns',
    editors=tabulator_editors, header_filters=True
)

# Save the widget to HTML with header filters
header_filter_table.save(f"data/feature-extraction/transformersjs_{today}.html")
header_filter_table.save("index.html")
# df.to_html(index=False)  # pandas has no sorting/filtering option
# %% [markdown]
# # Send ntfy notifications
# %%
# Format the DataFrame into a list
list_message = f"Trending HuggingFace Embedding Models - {today}\n"
list_message += f"{df.__len__()} available for feature-extraction in transformers.js:\n\n"
for index, row in df.head(10).iterrows():
list_message += f"{index + 1}. {row['id']}, Likes: {row['likes']}, Downloads: {row['downloads']}\n Sizes: {row['sizes']}\n\n"
list_message += f"Meta data about all {df.__len__()} models can be downloaded on GitHub as csv, xlsx, json, parquet, html. Models can be downloaded from HuggingFace. Originally designed for SemanticFinder, a web app for in-browser semantic search where you can test all models without installing anything."
# %%
import requests
import json
from datetime import datetime
# Get the current date and weekday
current_date = datetime.now()
current_day_of_week = current_date.weekday()
current_day_of_month = current_date.day
# Define the base URL for the ntfy.sh server
base_url = "https://ntfy.sh/"
# Prepare the actions as a list of dictionaries
actions = [
    {"action": "view", "label": "GitHub", "url": "/~https://github.com/do-me/trending-huggingface-models"},
    {"action": "view", "label": "HuggingFace", "url": "https://huggingface.co/models?library=transformers.js&other=feature-extraction&sort=trending"},
    {"action": "view", "label": "SemanticFinder", "url": "https://do-me.github.io/SemanticFinder/"}
]

# Define the channel names
channels = {
    "daily": "feature_extraction_transformers_js_models_daily",
    "weekly": "feature_extraction_transformers_js_models_weekly",
    "monthly": "feature_extraction_transformers_js_models_monthly"
}

# Function to send a notification; the full list_message is always sent, the `message` argument only labels the trigger
def send_notification(channel, message):
    payload = {
        "topic": channel,
        "message": list_message,
        "actions": actions
    }
    response = requests.post(base_url, json=payload)
    print(f"Notification sent to {channel}. Status Code: {response.status_code}")
# Send daily notification
send_notification(channels["daily"], "Daily request message")
# Check if today is Monday (0 is Monday, 6 is Sunday) and send weekly notification
if current_day_of_week == 0:
    send_notification(channels["weekly"], "Weekly request message")

# Check if today is the first of the month and send monthly notification
if current_day_of_month == 1:
    send_notification(channels["monthly"], "Monthly request message")
# %%
print(df.head(10))
# %%