From 803fce8d6f946b30002fc72e7209b48def581cd5 Mon Sep 17 00:00:00 2001
From: meg-huggingface <90473723+meg-huggingface@users.noreply.github.com>
Date: Wed, 16 Nov 2022 18:15:42 -0800
Subject: [PATCH 1/3] Changing how running through all modules is done

---
 run_data_measurements.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/run_data_measurements.py b/run_data_measurements.py
index 7724553..db45208 100644
--- a/run_data_measurements.py
+++ b/run_data_measurements.py
@@ -67,9 +67,6 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
     logs.info("Calculating vocab.")
     dstats.load_or_prepare_vocab()
 
-    if not calculation:
-        do_all = True
-
     if do_all or calculation == "general":
         logs.info("\n* Calculating general statistics.")
         dstats.load_or_prepare_general_stats()
@@ -77,7 +74,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
         logs.info(
             "Basic text statistics now available at %s." % dstats.general_stats_json_fid)
 
-    if do_all or calculation == "duplicates":
+    if calculation == "all" or calculation == "duplicates":
         logs.info("\n* Calculating text duplicates.")
         dstats.load_or_prepare_text_duplicates()
         duplicates_fid_dict = dstats.duplicates_files
@@ -85,7 +82,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
         for key, value in duplicates_fid_dict.items():
             logs.info("%s: %s" % (key, value))
 
-    if do_all or calculation == "lengths":
+    if calculation == "all" or calculation == "lengths":
         logs.info("\n* Calculating text lengths.")
         dstats.load_or_prepare_text_lengths()
         length_fid_dict = dstats.length_obj.get_filenames()
@@ -94,7 +91,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
             print("%s: %s" % (key, value))
         print()
 
-    if do_all or calculation == "labels":
+    if calculation == "all" or calculation == "labels":
         logs.info("\n* Calculating label statistics.")
         dstats.load_or_prepare_labels()
         npmi_fid_dict = dstats.label_files
@@ -103,7 +100,21 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
             print("%s: %s" % (key, value))
         print()
 
-    if do_all or calculation == "npmi":
+
+    if calculation == "all" or calculation == "zipf":
+        logs.info("\n* Preparing Zipf.")
+        dstats.load_or_prepare_zipf()
+        logs.info("Done!")
+        zipf_json_fid, zipf_fig_json_fid, zipf_fig_html_fid = zipf.get_zipf_fids(
+            dstats.dataset_cache_dir)
+        logs.info("Zipf results now available at %s." % zipf_json_fid)
+        logs.info(
+            "Figure saved to %s, with corresponding json at %s."
+            % (zipf_fig_html_fid, zipf_fig_json_fid)
+        )
+
+    # Don't do this one until someone specifically asks for it -- takes awhile.
+    if calculation == "npmi":
         print("\n* Preparing nPMI.")
         dstats.load_or_prepare_npmi()
         npmi_fid_dict = dstats.npmi_files
@@ -117,18 +128,6 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
                 print("%s: %s" % (key, value))
         print()
 
-    if do_all or calculation == "zipf":
-        logs.info("\n* Preparing Zipf.")
-        dstats.load_or_prepare_zipf()
-        logs.info("Done!")
-        zipf_json_fid, zipf_fig_json_fid, zipf_fig_html_fid = zipf.get_zipf_fids(
-            dstats.dataset_cache_dir)
-        logs.info("Zipf results now available at %s." % zipf_json_fid)
-        logs.info(
-            "Figure saved to %s, with corresponding json at %s."
-            % (zipf_fig_html_fid, zipf_fig_json_fid)
-        )
-
     # Don't do this one until someone specifically asks for it -- takes awhile.
     if calculation == "embeddings":
         logs.info("\n* Preparing text embeddings.")
@@ -210,6 +209,7 @@ def main():
     parser.add_argument(
         "-w",
         "--calculation",
+        default="all",
         help="""What to calculate (defaults to everything except embeddings and perplexities).\n
                                                     Options are:\n
 

From f7f3697a9cd63457d3cb1df0ba9c48ba5fc6dc05 Mon Sep 17 00:00:00 2001
From: meg-huggingface <90473723+meg-huggingface@users.noreply.github.com>
Date: Wed, 16 Nov 2022 18:17:37 -0800
Subject: [PATCH 2/3] Remove leftover do_all flag; gate general stats on calculation == "all"

---
 run_data_measurements.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/run_data_measurements.py b/run_data_measurements.py
index db45208..b29f688 100644
--- a/run_data_measurements.py
+++ b/run_data_measurements.py
@@ -59,7 +59,6 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
     # TODO: Catch error exceptions for each measurement, so that an error
     # for one measurement doesn't break the calculation of all of them.
 
-    do_all = False
     dstats = dataset_statistics.DatasetStatisticsCacheClass(**dataset_args,
                                                             use_cache=use_cache)
     logs.info("Tokenizing dataset.")
@@ -67,7 +66,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
     logs.info("Calculating vocab.")
     dstats.load_or_prepare_vocab()
 
-    if do_all or calculation == "general":
+    if calculation == "all" or calculation == "general":
         logs.info("\n* Calculating general statistics.")
         dstats.load_or_prepare_general_stats()
         logs.info("Done!")

From 4fa1110e8f2fd97cef15516b7ab6f171bf36dbcc Mon Sep 17 00:00:00 2001
From: meg <90473723+meg-huggingface@users.noreply.github.com>
Date: Fri, 12 May 2023 16:22:03 -0700
Subject: [PATCH 3/3] Include nPMI in the default "all" run; update comments on removed measurements

---
 run_data_measurements.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/run_data_measurements.py b/run_data_measurements.py
index b29f688..07fb75a 100644
--- a/run_data_measurements.py
+++ b/run_data_measurements.py
@@ -112,8 +112,7 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
             % (zipf_fig_html_fid, zipf_fig_json_fid)
         )
 
-    # Don't do this one until someone specifically asks for it -- takes awhile.
-    if calculation == "npmi":
+    if calculation == "all" or calculation == "npmi":
         print("\n* Preparing nPMI.")
         dstats.load_or_prepare_npmi()
         npmi_fid_dict = dstats.npmi_files
@@ -127,12 +126,12 @@ def load_or_prepare(dataset_args, calculation=False, use_cache=False):
                 print("%s: %s" % (key, value))
         print()
 
-    # Don't do this one until someone specifically asks for it -- takes awhile.
+    # Embeddings were removed from the tool; runs only on explicit request.
     if calculation == "embeddings":
         logs.info("\n* Preparing text embeddings.")
         dstats.load_or_prepare_embeddings()
 
-    # Don't do this one until someone specifically asks for it -- takes awhile.
+    # Perplexities were removed from the tool; runs only on explicit request.
     if calculation == "perplexities":
         logs.info("\n* Preparing text perplexities.")
         dstats.load_or_prepare_text_perplexities()