From 803fce8d6f946b30002fc72e7209b48def581cd5 Mon Sep 17 00:00:00 2001 From: meg-huggingface <90473723+meg-huggingface@users.noreply.github.com> Date: Wed, 16 Nov 2022 18:15:42 -0800 Subject: [PATCH 1/3] Changing how running through all modules is done --- run_data_measurements.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/run_data_measurements.py b/run_data_measurements.py index 7724553..db45208 100644 --- a/run_data_measurements.py +++ b/run_data_measurements.py @@ -67,9 +67,6 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): logs.info("Calculating vocab.") dstats.load_or_prepare_vocab() - if not calculation: - do_all = True - if do_all or calculation == "general": logs.info("\n* Calculating general statistics.") dstats.load_or_prepare_general_stats() @@ -77,7 +74,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): logs.info( "Basic text statistics now available at %s." % dstats.general_stats_json_fid) - if do_all or calculation == "duplicates": + if calculation == "all" or calculation == "duplicates": logs.info("\n* Calculating text duplicates.") dstats.load_or_prepare_text_duplicates() duplicates_fid_dict = dstats.duplicates_files @@ -85,7 +82,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): for key, value in duplicates_fid_dict.items(): logs.info("%s: %s" % (key, value)) - if do_all or calculation == "lengths": + if calculation == "all" or calculation == "lengths": logs.info("\n* Calculating text lengths.") dstats.load_or_prepare_text_lengths() length_fid_dict = dstats.length_obj.get_filenames() @@ -94,7 +91,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): print("%s: %s" % (key, value)) print() - if do_all or calculation == "labels": + if calculation == "all" or calculation == "labels": logs.info("\n* Calculating label statistics.") dstats.load_or_prepare_labels() npmi_fid_dict = dstats.label_files @@ -103,7 +100,21 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): print("%s: %s" % (key, value)) print() - if do_all or calculation == "npmi": + + if calculation == "all" or calculation == "zipf": + logs.info("\n* Preparing Zipf.") + dstats.load_or_prepare_zipf() + logs.info("Done!") + zipf_json_fid, zipf_fig_json_fid, zipf_fig_html_fid = zipf.get_zipf_fids( + dstats.dataset_cache_dir) + logs.info("Zipf results now available at %s." % zipf_json_fid) + logs.info( + "Figure saved to %s, with corresponding json at %s." + % (zipf_fig_html_fid, zipf_fig_json_fid) + ) + + # Don't do this one until someone specifically asks for it -- takes awhile. + if calculation == "npmi": print("\n* Preparing nPMI.") dstats.load_or_prepare_npmi() npmi_fid_dict = dstats.npmi_files @@ -117,18 +128,6 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): print("%s: %s" % (key, value)) print() - if do_all or calculation == "zipf": - logs.info("\n* Preparing Zipf.") - dstats.load_or_prepare_zipf() - logs.info("Done!") - zipf_json_fid, zipf_fig_json_fid, zipf_fig_html_fid = zipf.get_zipf_fids( - dstats.dataset_cache_dir) - logs.info("Zipf results now available at %s." % zipf_json_fid) - logs.info( - "Figure saved to %s, with corresponding json at %s." - % (zipf_fig_html_fid, zipf_fig_json_fid) - ) - # Don't do this one until someone specifically asks for it -- takes awhile. if calculation == "embeddings": logs.info("\n* Preparing text embeddings.") @@ -210,6 +209,7 @@ def main(): parser.add_argument( "-w", "--calculation", + default="all", help="""What to calculate (defaults to everything except embeddings and perplexities).\n Options are:\n From f7f3697a9cd63457d3cb1df0ba9c48ba5fc6dc05 Mon Sep 17 00:00:00 2001 From: meg-huggingface <90473723+meg-huggingface@users.noreply.github.com> Date: Wed, 16 Nov 2022 18:17:37 -0800 Subject: [PATCH 2/3] variable value change --- run_data_measurements.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/run_data_measurements.py b/run_data_measurements.py index db45208..b29f688 100644 --- a/run_data_measurements.py +++ b/run_data_measurements.py @@ -59,7 +59,6 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): # TODO: Catch error exceptions for each measurement, so that an error # for one measurement doesn't break the calculation of all of them. - do_all = False dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args, use_cache=use_cache) logs.info("Tokenizing dataset.") @@ -67,7 +66,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): logs.info("Calculating vocab.") dstats.load_or_prepare_vocab() - if do_all or calculation == "general": + if calculation == "all" or calculation == "general": logs.info("\n* Calculating general statistics.") dstats.load_or_prepare_general_stats() logs.info("Done!") From 4fa1110e8f2fd97cef15516b7ab6f171bf36dbcc Mon Sep 17 00:00:00 2001 From: meg <90473723+meg-huggingface@users.noreply.github.com> Date: Fri, 12 May 2023 16:22:03 -0700 Subject: [PATCH 3/3] Update run_data_measurements.py --- run_data_measurements.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/run_data_measurements.py b/run_data_measurements.py index b29f688..07fb75a 100644 --- a/run_data_measurements.py +++ b/run_data_measurements.py @@ -112,8 +112,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): % (zipf_fig_html_fid, zipf_fig_json_fid) ) - # Don't do this one until someone specifically asks for it -- takes awhile. - if calculation == "npmi": + if calculation == "all" or calculation == "npmi": print("\n* Preparing nPMI.") dstats.load_or_prepare_npmi() npmi_fid_dict = dstats.npmi_files @@ -127,12 +126,12 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False): print("%s: %s" % (key, value)) print() - # Don't do this one until someone specifically asks for it -- takes awhile. + # We removed this from the tool. if calculation == "embeddings": logs.info("\n* Preparing text embeddings.") dstats.load_or_prepare_embeddings() - # Don't do this one until someone specifically asks for it -- takes awhile. + # We removed this from the tool. if calculation == "perplexities": logs.info("\n* Preparing text perplexities.") dstats.load_or_prepare_text_perplexities()