Skip to content

Commit

Permalink
Refactor accuracy table writer script
Browse files Browse the repository at this point in the history
  • Loading branch information
pemistahl committed Sep 13, 2024
1 parent 336f14b commit ef426d7
Showing 1 changed file with 68 additions and 127 deletions.
195 changes: 68 additions & 127 deletions scripts/accuracy_table_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,114 +20,48 @@


class AccuracyTableWriter:
_columns = (
"average-lingua-high",
"average-lingua-low",
"average-langdetect",
"average-fasttext",
"average-fastspell-cons",
"average-fastspell-aggr",
"average-langid",
"average-cld3",
"average-cld2",
"average-simplemma",
"single-words-lingua-high",
"single-words-lingua-low",
"single-words-langdetect",
"single-words-fasttext",
"single-words-fastspell-cons",
"single-words-fastspell-aggr",
"single-words-langid",
"single-words-cld3",
"single-words-cld2",
"single-words-simplemma",
"word-pairs-lingua-high",
"word-pairs-lingua-low",
"word-pairs-langdetect",
"word-pairs-fasttext",
"word-pairs-fastspell-cons",
"word-pairs-fastspell-aggr",
"word-pairs-langid",
"word-pairs-cld3",
"word-pairs-cld2",
"word-pairs-simplemma",
"sentences-lingua-high",
"sentences-lingua-low",
"sentences-langdetect",
"sentences-fasttext",
"sentences-fastspell-cons",
"sentences-fastspell-aggr",
"sentences-langid",
"sentences-cld3",
"sentences-cld2",
"sentences-simplemma",
)
_table = """<table>
_column_labels = {
"cld2": "&nbsp;&nbsp;CLD2&nbsp;&nbsp;",
"cld3": "&nbsp;&nbsp;CLD3&nbsp;&nbsp;",
"fastspell-aggr": "FastSpell<br>(aggressive mode)",
"fastspell-cons": "FastSpell<br>(conservative mode)",
"fasttext": "FastText",
"langdetect": "Langdetect",
"langid": "Langid",
"lingua-low-accuracy": "Lingua<br>(low accuracy mode)",
"lingua-high-accuracy": "Lingua<br>(high accuracy mode)",
"simplemma": "Simplemma",
}

def __init__(self, table_title: str, report_file_path: Path):
self._table_title = table_title
self._dataframe = self._read_into_dataframe(report_file_path)

def write_accuracy_table(self, file_path: Path):
mean = self._dataframe.mean().round()
median = self._dataframe.median().round(2)
std = self._dataframe.std().round(2)

colspan = len(self._column_labels)
table = f"""<table>
<tr>
<th>Language</th>
<th colspan="10">Average</th>
<th colspan="10">Single Words</th>
<th colspan="10">Word Pairs</th>
<th colspan="10">Sentences</th>
<th colspan="{colspan}">{self._table_title}</th>
</tr>
<tr>
<th></th>
<th>Lingua<br>(high accuracy mode)</th>
<th>Lingua<br>(low accuracy mode)</th>
<th>Langdetect</th>
<th>FastText</th>
<th>FastSpell<br>(conservative mode)</th>
<th>FastSpell<br>(aggressive mode)</th>
<th>Langid</th>
<th>&nbsp;&nbsp;CLD3&nbsp;&nbsp;</th>
<th>&nbsp;&nbsp;CLD2&nbsp;&nbsp;</th>
<th>Simplemma</th>
<th>Lingua<br>(high accuracy mode)</th>
<th>Lingua<br>(low accuracy mode)</th>
<th>Langdetect</th>
<th>FastText</th>
<th>FastSpell<br>(conservative mode)</th>
<th>FastSpell<br>(aggressive mode)</th>
<th>Langid</th>
<th>&nbsp;&nbsp;CLD3&nbsp;&nbsp;</th>
<th>&nbsp;&nbsp;CLD2&nbsp;&nbsp;</th>
<th>Simplemma</th>
<th>Lingua<br>(high accuracy mode)</th>
<th>Lingua<br>(low accuracy mode)</th>
<th>Langdetect</th>
<th>FastText</th>
<th>FastSpell<br>(conservative mode)</th>
<th>FastSpell<br>(aggressive mode)</th>
<th>Langid</th>
<th>&nbsp;&nbsp;CLD3&nbsp;&nbsp;</th>
<th>&nbsp;&nbsp;CLD2&nbsp;&nbsp;</th>
<th>Simplemma</th>
<th>Lingua<br>(high accuracy mode)</th>
<th>Lingua<br>(low accuracy mode)</th>
<th>Langdetect</th>
<th>FastText</th>
<th>FastSpell<br>(conservative mode)</th>
<th>FastSpell<br>(aggressive mode)</th>
<th>Langid</th>
<th>&nbsp;&nbsp;CLD3&nbsp;&nbsp;</th>
<th>&nbsp;&nbsp;CLD2&nbsp;&nbsp;</th>
<th>Simplemma</th>
</tr>
"""

def __init__(self, file_path):
self._dataframe = self._read_into_dataframe(file_path)
for column_label in self._column_labels.values():
table += f" <th>{column_label}</th>\n "

def write_accuracy_table(self, file_name):
mean = self._dataframe.mean().round().astype(int)
median = self._dataframe.median().round(2)
std = self._dataframe.std().round(2)
table += "</tr>\n <tr>\n"

for language in self._dataframe.index:
language_data = self._dataframe.loc[language]
self._table += f"\t<tr>\n\t\t<td>{language}</td>\n"
table += f" <td>{language}</td>\n"

for column in self._columns:
for column in self._column_labels.keys():
accuracy_value = language_data.loc[[column]].iloc[0]
if not math.isnan(accuracy_value):
accuracy_value = int(round(accuracy_value))
Expand All @@ -136,46 +70,43 @@ def write_accuracy_table(self, file_name):
accuracy_str = "-"

color = self._get_square_color(accuracy_value)
self._table += f'\t\t<td><img src="https://raw.githubusercontent.com/pemistahl/lingua-py/pure-python-impl/images/{color}.png"> {accuracy_str}</td>\n'
table += f' <td><img src="https://raw.githubusercontent.com/pemistahl/lingua-py/pure-python-impl/images/{color}.png"> {accuracy_str}</td>\n'

self._table += "\t</tr>\n"
table += " </tr>\n"

self._table += '\t<tr>\n\t\t<td colspan="40"></td>\n\t</tr>\n'
self._table += "\t<tr>\n\t\t<td><strong>Mean</strong></td>\n"
table += f' <tr>\n <td colspan="{colspan}"></td>\n </tr>\n'
table += " <tr>\n <td><strong>Mean</strong></td>\n"

for column in self._columns:
for column in self._column_labels.keys():
accuracy_value = mean.loc[[column]].iloc[0]
color = self._get_square_color(accuracy_value)
self._table += f'\t\t<td><img src="https://raw.githubusercontent.com/pemistahl/lingua-py/pure-python-impl/images/{color}.png"> <strong>{accuracy_value}</strong></td>\n'
table += f' <td><img src="https://raw.githubusercontent.com/pemistahl/lingua-py/pure-python-impl/images/{color}.png"> <strong>{accuracy_value}</strong></td>\n'

self._table += "\t</tr>\n"
self._table += '\t<tr>\n\t\t<td colspan="40"></td>\n\t</tr>\n'
self._table += "\t<tr>\n\t\t<td>Median</td>\n"
table += " </tr>\n"
table += f' <tr>\n <td colspan="{colspan}"></td>\n </tr>\n'
table += " <tr>\n <td>Median</td>\n"

for column in self._columns:
for column in self._column_labels.keys():
accuracy_value = median.loc[[column]].iloc[0]
self._table += f"\t\t<td>{accuracy_value}</td>\n"
table += f" <td>{accuracy_value}</td>\n"

self._table += "\t</tr>\n"
self._table += "\t<tr>\n\t\t<td>Standard Deviation</td>\n"
table += " </tr>\n"
table += " <tr>\n <td>Standard Deviation</td>\n"

for column in self._columns:
for column in self._column_labels.keys():
accuracy_value = std.loc[[column]].iloc[0]
self._table += f"\t\t<td>{accuracy_value}</td>\n"
table += f" <td>{accuracy_value}</td>\n"

self._table += "\t</tr>\n"
self._table += "</table>"
table += " </tr>\n"
table += "</table>"

with open(
Path(__file__).parent / f"../{file_name}", mode="w"
) as accuracy_table_file:
accuracy_table_file.write(self._table)
with open(file_path, mode="w") as accuracy_table_file:
accuracy_table_file.write(table)

def _read_into_dataframe(self, file_path) -> pd.DataFrame:
frame = pd.read_csv(filepath_or_buffer=file_path).set_index("language")
return frame.reindex(sorted(frame.columns), axis=1)
def _read_into_dataframe(self, report_file_path: Path) -> pd.DataFrame:
return pd.read_csv(report_file_path, index_col="language")

def _get_square_color(self, accuracy_value: float):
def _get_square_color(self, accuracy_value: float) -> str:
if math.isnan(accuracy_value):
return "grey"
elif 0 <= accuracy_value <= 20:
Expand All @@ -193,9 +124,19 @@ def _get_square_color(self, accuracy_value: float):


if __name__ == "__main__":
file_path = (
Path(__file__).parent / "../accuracy-reports/aggregated-accuracy-values.csv"
)
writer = AccuracyTableWriter(file_path)
writer.write_accuracy_table(file_name="ACCURACY_TABLE.md")
print("Accuracy table written successfully")
report_directory_path = Path(__file__).parent / "../accuracy-reports"
table_directory_path = Path(__file__).parent / "../tables"
prefixes = ("average", "single-words", "word-pairs", "sentences")

for prefix in prefixes:
table_title = prefix.title().replace("-", " ")
table_file_name = prefix.upper().replace("-", "_")
writer = AccuracyTableWriter(
table_title=f"{table_title} Detection Performance",
report_file_path=report_directory_path / f"{prefix}-accuracy-values.csv",
)
writer.write_accuracy_table(
file_path=table_directory_path / f"{table_file_name}_ACCURACY_TABLE.md"
)

print("All accuracy tables created successfully")

0 comments on commit ef426d7

Please sign in to comment.