From 1b9de37f9fdf9c071b4bcb6c82d8d3eef582c3c2 Mon Sep 17 00:00:00 2001 From: Miniyahil Kebede <53700166+hmhard@users.noreply.github.com> Date: Mon, 12 Aug 2024 12:58:38 +0300 Subject: [PATCH] code refactoring (#2) --- README.md | 2 +- image.png => assets/image.png | Bin clean_stop_words.py | 6 +- clear-non-alpha.py | 6 +- .../cleared_unwanted_keys.json | 0 .../english_keys.json | 0 .../final_filtered_data.json | 0 .../invalid_keys.json | 0 .../non_english_keys.json | 0 stop_words.json => outputs/stop_words.json | 0 valid_keys.json => outputs/valid_keys.json | 0 .../word_dictionary.json | 0 process.sh | 36 ++ remove-duplicates.py | 4 +- remove_unwanted_chars_from_keys.py | 4 +- separate.py | 6 +- top-500.json | 502 ++++++++++++++++++ top-words.py | 24 +- word_cat.py | 2 +- words-extractor.py | 7 +- 20 files changed, 571 insertions(+), 28 deletions(-) rename image.png => assets/image.png (100%) rename cleared_unwanted_keys.json => outputs/cleared_unwanted_keys.json (100%) rename english_keys.json => outputs/english_keys.json (100%) rename final_filtered_data.json => outputs/final_filtered_data.json (100%) rename invalid_keys.json => outputs/invalid_keys.json (100%) rename non_english_keys.json => outputs/non_english_keys.json (100%) rename stop_words.json => outputs/stop_words.json (100%) rename valid_keys.json => outputs/valid_keys.json (100%) rename word_dictionary.json => outputs/word_dictionary.json (100%) create mode 100644 process.sh create mode 100644 top-500.json diff --git a/README.md b/README.md index d2d3d7b..bc91abe 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ ## one of analysis dashboard -![Image](./image.png) +![Image](./assets/image.png) ### libraries used - bs4 diff --git a/image.png b/assets/image.png similarity index 100% rename from image.png rename to assets/image.png diff --git a/clean_stop_words.py b/clean_stop_words.py index 601bfc7..dd4562d 100644 --- a/clean_stop_words.py +++ b/clean_stop_words.py @@ -1,8 +1,8 @@ import json 
-data_file_path = 'cleared_unwanted_keys.json' -keys_to_remove_file_path = 'stop_words.json' -output_file_path = 'final_filtered_data.json' +data_file_path = 'outputs/cleared_unwanted_keys.json' +keys_to_remove_file_path = 'outputs/stop_words.json' +output_file_path = 'outputs/final_filtered_data.json' with open(data_file_path, 'r', encoding='utf-8') as file: key_value_pairs = json.load(file) diff --git a/clear-non-alpha.py b/clear-non-alpha.py index 06b7c4a..cf2ea2a 100644 --- a/clear-non-alpha.py +++ b/clear-non-alpha.py @@ -1,10 +1,10 @@ import json import re -file_path = 'word_dictionary.json' +file_path = 'outputs/word_dictionary.json' -valid_keys_file = 'valid_keys.json' -invalid_keys_file = 'invalid_keys.json' +valid_keys_file = 'outputs/valid_keys.json' +invalid_keys_file = 'outputs/invalid_keys.json' with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) diff --git a/cleared_unwanted_keys.json b/outputs/cleared_unwanted_keys.json similarity index 100% rename from cleared_unwanted_keys.json rename to outputs/cleared_unwanted_keys.json diff --git a/english_keys.json b/outputs/english_keys.json similarity index 100% rename from english_keys.json rename to outputs/english_keys.json diff --git a/final_filtered_data.json b/outputs/final_filtered_data.json similarity index 100% rename from final_filtered_data.json rename to outputs/final_filtered_data.json diff --git a/invalid_keys.json b/outputs/invalid_keys.json similarity index 100% rename from invalid_keys.json rename to outputs/invalid_keys.json diff --git a/non_english_keys.json b/outputs/non_english_keys.json similarity index 100% rename from non_english_keys.json rename to outputs/non_english_keys.json diff --git a/stop_words.json b/outputs/stop_words.json similarity index 100% rename from stop_words.json rename to outputs/stop_words.json diff --git a/valid_keys.json b/outputs/valid_keys.json similarity index 100% rename from valid_keys.json rename to outputs/valid_keys.json diff 
--git a/word_dictionary.json b/outputs/word_dictionary.json similarity index 100% rename from word_dictionary.json rename to outputs/word_dictionary.json diff --git a/process.sh b/process.sh new file mode 100644 index 0000000..69758ca --- /dev/null +++ b/process.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ "$#" -ne 1 ]; then + echo "Usage: $0 <top_n>" + exit 1 +fi + +top_n="$1" + +python_files=( + # "words-extractor.py" + # "clear-non-alpha.py" + # "separate.py" + # "remove_unwanted_chars_from_keys.py" + "clean_stop_words.py" +) + +for file in "${python_files[@]}" +do + echo "Running $file..." + python "$file" + if [ $? -ne 0 ]; then + echo "Error: $file failed to run." + exit 1 + fi + echo "$file completed successfully." +done + +echo "Running TOP $top_n..." +python "top-words.py" $top_n +if [ $? -ne 0 ]; then + echo "Error: top-words.py failed to run." + exit 1 +fi + +echo "All processing is done." diff --git a/remove-duplicates.py b/remove-duplicates.py index c7383c7..cbde3ff 100644 --- a/remove-duplicates.py +++ b/remove-duplicates.py @@ -1,10 +1,10 @@ import json # Path to the input JSON file containing an array of words -input_file_path = 'stop_words.json' +input_file_path = 'outputs/stop_words.json' # Path to the output JSON file where unique words will be saved -output_file_path = 'unique_words.json' +output_file_path = 'outputs/unique_words.json' # Open and load the JSON file with open(input_file_path, 'r', encoding='utf-8') as file: diff --git a/remove_unwanted_chars_from_keys.py b/remove_unwanted_chars_from_keys.py index b6d3906..ec9c63e 100644 --- a/remove_unwanted_chars_from_keys.py +++ b/remove_unwanted_chars_from_keys.py @@ -2,10 +2,10 @@ import re # Path to the input JSON file with key-value pairs -input_file_path = 'non_english_keys.json' +input_file_path = 'outputs/non_english_keys.json' # Path to the output JSON file where filtered key-value pairs will be saved -output_file_path = 
'outputs/cleared_unwanted_keys.json' # Regular expressions for identifying unwanted characters in keys symbols_pattern = re.compile(r'[^\w\s]', re.UNICODE) # Matches any symbols (excluding alphanumeric and whitespace) diff --git a/separate.py b/separate.py index 739d5b9..9ac8c23 100644 --- a/separate.py +++ b/separate.py @@ -1,11 +1,11 @@ import json import re -file_path = 'valid_keys.json' +file_path = 'outputs/valid_keys.json' # Output file paths -english_keys_file = 'english_keys.json' -non_english_keys_file = 'non_english_keys.json' +english_keys_file = 'outputs/english_keys.json' +non_english_keys_file = 'outputs/non_english_keys.json' # Open and load the JSON file with open(file_path, 'r', encoding='utf-8') as file: diff --git a/top-500.json b/top-500.json new file mode 100644 index 0000000..12d2708 --- /dev/null +++ b/top-500.json @@ -0,0 +1,502 @@ +{ + "ሰዎች": 14109, + "ከተማ": 10457, + "ክልል": 9968, + "አቶ": 9289, + "ቤት": 8725, + "አበባ": 7661, + "ሲሆን": 7288, + "ዛሬ": 6636, + "መሆኑን": 6383, + "ተማሪዎች": 5937, + "ቀን": 5896, + "ከፍተኛ": 5741, + "የኢትዮጵያ": 5684, + "ሰዓት": 5294, + "ፖሊስ": 5167, + "እንዲሁም": 5136, + "መንግስት": 4997, + "ትምህርት": 4992, + "መረጃ": 4845, + "ስራ": 4820, + "ብር": 4673, + "ኢትዮጵያ": 4528, + "ቫይረስ": 4229, + "ደግሞ": 4185, + "ቁጥር": 4154, + "ሚኒስትር": 4130, + "አገልግሎት": 4026, + "ዞን": 3999, + "ዩኒቨርሲቲ": 3820, + "ዓመት": 3709, + "ሚኒስቴር": 3588, + "አባላት": 3540, + "ችግር": 3503, + "ጠቅላይ": 3370, + "ሺህ": 3345, + "ሲሉ": 3280, + "መግለጫ": 3193, + "ጉዳት": 3151, + "አካላት": 3151, + "ምርመራ": 3099, + "አስተዳደር": 3060, + "ምክር": 3009, + "ወረዳ": 2953, + "ደረጃ": 2935, + "ድጋፍ": 2896, + "አካባቢ": 2883, + "ዶክተር": 2824, + "ሀገር": 2797, + "አቀፍ": 2726, + "መሰረት": 2687, + "ድረስ": 2654, + "ክፍል": 2564, + "ተቋማት": 2547, + "ልዩ": 2541, + "በኢትዮጵያ": 2526, + "ዕለት": 2491, + "ቤቱ": 2489, + "ቢሮ": 2480, + "እንደሆነ": 2456, + "ጉዳዮች": 2446, + "የትምህርት": 2393, + "ቤቶች": 2378, + "ያሉ": 2372, + "ምክትል": 2363, + "ጥቃት": 2360, + "ጨምሮ": 2360, + "አደጋ": 2343, + "በቁጥጥር": 2340, + "ቀናት": 2336, + "በቫይረሱ": 2326, + "ህይወት": 2304, + "ሰላም": 2293, + 
"ህዝብ": 2287, + "በአዲስ": 2278, + "ጥሪ": 2270, + "ወር": 2235, + "ሚሊዮን": 2215, + "ውይይት": 2209, + "በሚል": 2180, + "ተብሎ": 2164, + "የውጭ": 2153, + "ምንም": 2146, + "ግጭት": 2141, + "ፍርድ": 2136, + "ምርጫ": 2118, + "አህመድ": 2084, + "ዜጎች": 2082, + "በተለያዩ": 2073, + "ፈተና": 2040, + "የፀጥታ": 2026, + "ጤና": 2011, + "የአዲስ": 2004, + "አካባቢዎች": 1970, + "ጥያቄ": 1962, + "የስራ": 1952, + "ተከትሎ": 1943, + "ኮሚሽን": 1926, + "መንግሥት": 1924, + "በኮሮና": 1913, + "ድርጅት": 1897, + "በማድረግ": 1890, + "ውሳኔ": 1876, + "ኃይል": 1866, + "ያለውን": 1856, + "እስካሁን": 1852, + "ባለፉት": 1833, + "ነዋሪዎች": 1829, + "ያሉት": 1822, + "መያዛቸው": 1815, + "ይፋ": 1809, + "ለማድረግ": 1805, + "ሥራ": 1803, + "አካል": 1788, + "ይህን": 1783, + "ሪፖርት": 1776, + "ቃል": 1756, + "የሚገኙ": 1746, + "የክልሉ": 1735, + "ግንኙነት": 1731, + "ሆነው": 1727, + "አብይ": 1725, + "የጤና": 1721, + "በሙሉ": 1709, + "የነበሩ": 1672, + "በመሆኑ": 1652, + "የኮሮና": 1641, + "ኃላፊ": 1628, + "አጠቃላይ": 1618, + "ውጤት": 1617, + "እርምጃ": 1604, + "መሆኑ": 1594, + "ጥረት": 1593, + "የመንግስት": 1593, + "ቡድን": 1587, + "ሀገራት": 1586, + "በዛሬው": 1572, + "የሰላም": 1557, + "ዓለም": 1556, + "ዝግጅት": 1553, + "አየር": 1534, + "መልዕክት": 1532, + "ፓርቲ": 1521, + "የተያዙ": 1518, + "ምላሽ": 1514, + "ቀደም": 1488, + "ኮሚቴ": 1483, + "ግለሰቦች": 1470, + "ዋጋ": 1464, + "በዓል": 1438, + "ወጣቶች": 1425, + "ወንጀል": 1417, + "ሆስፒታል": 1416, + "በተመለከተ": 1413, + "እንቅስቃሴ": 1412, + "የላብራቶሪ": 1410, + "አመት": 1404, + "አመራሮች": 1401, + "ፕሬዝዳንት": 1400, + "መከላከያ": 1386, + "የሚል": 1376, + "በኃላ": 1372, + "ሆኖ": 1362, + "ባንክ": 1357, + "የፖለቲካ": 1347, + "በተጨማሪ": 1333, + "ብሔራዊ": 1315, + "ዐቢይ": 1310, + "የሆነ": 1307, + "ከተሞች": 1290, + "በተያያዘ": 1290, + "የትግራይ": 1286, + "ሳምንት": 1279, + "መልኩ": 1274, + "ብሎ": 1264, + "የአማራ": 1258, + "ባለው": 1241, + "በትግራይ": 1239, + "ልማት": 1235, + "ኃይሎች": 1233, + "ጉባኤ": 1219, + "አባል": 1218, + "ድምፅ": 1211, + "ሂደት": 1205, + "መብት": 1201, + "ህግ": 1192, + "ደህንነት": 1192, + "ገንዘብ": 1178, + "ሚኒስትሩ": 1176, + "ጦርነት": 1170, + "ያላቸው": 1160, + "በመቶ": 1158, + "ስብሰባ": 1151, + "የህዝብ": 1150, + "ሚዲያ": 1142, + "ዳይሬክተር": 1133, + "በሰላም": 1120, + "ስጋት": 1113, + "የአሜሪካ": 1103, + "ቤተሰብ": 1101, + 
"ስምምነት": 1096, + "አደባባይ": 1095, + "ዓመታት": 1088, + "ሰላማዊ": 1084, + "ወራት": 1079, + "ሰልፍ": 1077, + "የአፍሪካ": 1075, + "መረጃዎች": 1072, + "በመሆን": 1071, + "ሳይሆን": 1071, + "ቦርድ": 1067, + "ህይወታቸው": 1061, + "ጣቢያ": 1055, + "ተወካዮች": 1045, + "በክልሉ": 1045, + "ክልሎች": 1040, + "ብሄራዊ": 1034, + "የህክምና": 1032, + "አፍሪካ": 1032, + "ከባድ": 1027, + "ህብረት": 1026, + "ተግባር": 1023, + "ዘመን": 1014, + "በማለት": 1014, + "መፍትሄ": 1014, + "አምባሳደር": 1013, + "የነበረ": 1002, + "በጋራ": 1000, + "በሰጡት": 997, + "ህክምና": 986, + "ሱዳን": 985, + "መሆናቸውን": 984, + "እርዳታ": 984, + "መደበኛ": 984, + "የሆኑ": 982, + "ሰዎችን": 981, + "ምሽት": 981, + "ግንባታ": 979, + "ትኩረት": 979, + "ከበሽታው": 975, + "ዙር": 974, + "ተማሪዎችን": 972, + "ፕሮግራም": 967, + "ግለሰብ": 965, + "ሰራዊት": 959, + "ሰራተኞች": 954, + "የጦር": 949, + "የምርጫ": 944, + "እየተካሄደ": 944, + "አንዱ": 942, + "ስም": 940, + "እየተደረገ": 939, + "የጋራ": 938, + "ንብረት": 937, + "የኦሮሚያ": 933, + "ዘርፍ": 931, + "ሜትር": 930, + "ንግግር": 928, + "የሆኑት": 913, + "ነጥብ": 910, + "ይህንን": 900, + "ጥንቃቄ": 900, + "ቦታዎች": 896, + "ክትትል": 895, + "ክስ": 885, + "የሀገር": 884, + "ተማሪ": 881, + "አዋጅ": 881, + "ዝርዝር": 880, + "የሰው": 878, + "ትግራይ": 876, + "የጸጥታ": 870, + "መልካም": 868, + "አድርጎ": 868, + "አሜሪካ": 866, + "ክፍለ": 865, + "የንግድ": 862, + "የዓለም": 859, + "የፌደራል": 858, + "ተደርጎ": 857, + "ያገገሙ": 855, + "ስርዓት": 854, + "በሀገሪቱ": 852, + "ተቋም": 848, + "የቲክቫህ": 847, + "በዚህም": 845, + "ወገኖች": 844, + "አራት": 841, + "እለት": 838, + "አስቸኳይ": 837, + "ሶስት": 834, + "ማዕከል": 834, + "ግድብ": 832, + "ቀበሌ": 831, + "ጦር": 829, + "ሺ": 825, + "ታሪክ": 822, + "ኢትዮጵያውያን": 822, + "ከንቲባ": 821, + "ችግሮች": 817, + "ድርጅቶች": 812, + "ማህበረሰብ": 809, + "ለመከላከል": 806, + "አቅም": 805, + "አገር": 803, + "የሰብዓዊ": 803, + "ከአዲስ": 802, + "ባለፈው": 802, + "ፕሬዚዳንት": 800, + "ቤተ": 798, + "ውጪ": 796, + "በአማራ": 795, + "መሪ": 793, + "መድረክ": 793, + "መሠረት": 790, + "በአሁኑ": 788, + "መስቀል": 788, + "ዶላር": 786, + "የመጀመሪያ": 785, + "የግል": 784, + "በአጠቃላይ": 778, + "ችሎት": 778, + "ምግብ": 775, + "መምሪያ": 775, + "ጋዜጠኛ": 774, + "ነዋሪ": 773, + "ኤፍ": 772, + "በአካባቢው": 771, + "ለመስጠት": 770, + "ርዕሰ": 770, + "የሥራ": 767, + "ብለው": 766, + 
"መኪና": 765, + "መጠን": 765, + "ዩኒቨርሲቲዎች": 764, + "መስጠት": 764, + "የምግብ": 760, + "ነፃ": 759, + "የሚገኘው": 758, + "የመከላከያ": 754, + "መኖሪያ": 752, + "ስራዎች": 751, + "ሞት": 749, + "የፌዴራል": 747, + "የነበረውን": 744, + "በቀጣይ": 742, + "ቤተሰቦች": 737, + "አመራር": 736, + "ጎንደር": 734, + "ተግባራዊ": 731, + "በማህበራዊ": 730, + "አስተዳደሩ": 730, + "መንገዶች": 727, + "ድርጊት": 725, + "ከኢትዮጵያ": 725, + "መስከረም": 724, + "ወጥ": 723, + "መሳሪያ": 720, + "በደቡብ": 720, + "ሰኔ": 720, + "በከፍተኛ": 720, + "ቁጥጥር": 718, + "ደቡብ": 717, + "መረጃዎችን": 716, + "ሴቶች": 715, + "ሚሊየን": 714, + "ባለሙያዎች": 714, + "ሆቴል": 710, + "በቂ": 709, + "በበሽታው": 709, + "ንግድ": 708, + "ቫይረሱ": 706, + "እንዲሆን": 705, + "ባወጣው": 705, + "ክፍሎች": 704, + "ፓርቲዎች": 704, + "ትልቅ": 701, + "መቶ": 696, + "መስሪያ": 693, + "አስመልክቶ": 692, + "ቀጠሮ": 692, + "ሚዲያዎች": 689, + "ለመፍታት": 689, + "ግብር": 688, + "ሳይንስ": 685, + "ይህም": 684, + "በኦሮሚያ": 684, + "ጉዞ": 680, + "በነበረው": 678, + "ወታደራዊ": 677, + "የሚያስችል": 673, + "እንደሚገኝ": 673, + "ሕዝብ": 673, + "የገንዘብ": 671, + "የነበሩት": 670, + "የሀገሪቱ": 669, + "ኪሎ": 668, + "መሆኑንም": 667, + "በሁሉም": 666, + "ባለስልጣናት": 664, + "ጥበቃ": 664, + "የደቡብ": 661, + "የሆነው": 660, + "ህብረተሰቡ": 660, + "ወረርሽኝ": 660, + "ዩኒቨርስቲ": 654, + "ህዝቦች": 653, + "የሟቾች": 652, + "ውድድር": 651, + "ቴክኖሎጂ": 650, + "አውሮፕላን": 645, + "በሆነ": 645, + "እየተሰራ": 644, + "ግንቦት": 644, + "አገራት": 644, + "ባለስልጣን": 643, + "የአየር": 642, + "ይዞ": 641, + "ስልክ": 629, + "ወረዳዎች": 629, + "ሲሆኑ": 625, + "ጽህፈት": 625, + "ግቢ": 624, + "እየሰራ": 623, + "የእሳት": 622, + "ስነ": 621, + "ያላቸውን": 619, + "ለኢትዮጵያ": 617, + "ማህበር": 617, + "መገናኛ": 616, + "መስተዳድር": 616, + "የከተማ": 615, + "የኦሮሞ": 614, + "ምዝገባ": 612, + "ኤጀንሲ": 611, + "የህግ": 610, + "ተሽከርካሪዎች": 610, + "እጥረት": 609, + "ዕጣ": 609, + "ማህበራዊ": 607, + "ፈቃድ": 604, + "ጭምር": 603, + "በተለይም": 601, + "በጀት": 599, + "ስርጭት": 598, + "ከእስር": 598, + "ኮሚሽነር": 598, + "እያደረገ": 597, + "ማብራሪያ": 594, + "አስፈላጊውን": 594, + "ጥቅምት": 593, + "ክልላዊ": 591, + "በሌሎች": 589, + "ኢትዮጵያን": 588, + "በይፋ": 588, + "ሀሰተኛ": 588, + "የቤት": 587, + "ሕግ": 586, + "ክፍያ": 583, + "ለህዝብ": 582, + "ተስፋ": 581, + "አቀባበል": 581, + "በሚገኘው": 580, + 
"ነዳጅ": 579, + "ሬድዮ": 578, + "የአንድ": 577, + "ውሃ": 576, + "አዲሱ": 576, + "ወቅታዊ": 575, + "ትራንስፖርት": 573, + "ግንባር": 571, + "የትራንስፖርት": 570, + "የነዳጅ": 570, + "ልጆች": 568, + "ተገኝተው": 568, + "የምርመራ": 568, + "መሪዎች": 566, + "ስለሆነ": 565, + "ተጠርጣሪዎች": 564, + "ታከለ": 563, + "ጉሙዝ": 562, + "የማህበራዊ": 561, + "የሚሆኑ": 560, + "አስፈፃሚ": 560, + "በቀለ": 559, + "ደብዳቤ": 559, + "ገቢ": 559, + "ቅድመ": 558, + "ምንጭ": 556, + "በተደረገው": 556, + "መቐለ": 556, + "ወጣት": 554, + "ሥር": 553, + "ሴት": 552, + "በሀገራችን": 550, + "ጥያቄዎች": 549, + "ጉብኝት": 549, + "ታደሰ": 548, + "ዝግጁ": 547, + "ማቆያ": 545, + "ቴሌቪዥን": 544 +} \ No newline at end of file diff --git a/top-words.py b/top-words.py index 6b2f460..839c41b 100644 --- a/top-words.py +++ b/top-words.py @@ -1,16 +1,22 @@ import json +import argparse # Path to the JSON file -file_path = 'final_filtered_data.json' +file_path = 'outputs/final_filtered_data.json' -with open(file_path, 'r', encoding='utf-8') as file: - data = json.load(file) +def main(top_n): + with open(file_path, 'r', encoding='utf-8') as file: + data = json.load(file) + top = dict(sorted(data.items(), key=lambda item: item[1], reverse=True)[:top_n]) -top_n = 500 + with open("top-"+str(top_n)+".json", 'w', encoding='utf-8') as file: + json.dump(top, file, ensure_ascii=False, indent=4) + -top = dict(sorted(data.items(), key=lambda item: item[1], reverse=True)[:100]) - -print(json.dumps(top, indent=4)) -with open("top-"+str(top_n)+".json", 'w', encoding='utf-8') as file: - json.dump(top, file, ensure_ascii=False, indent=4) \ No newline at end of file +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Process top n.") + + parser.add_argument('top_n', type=int, help='An integer representing the top N value') + args = parser.parse_args() + main(args.top_n) \ No newline at end of file diff --git a/word_cat.py b/word_cat.py index 2bbfe2c..cb9108c 100644 --- a/word_cat.py +++ b/word_cat.py @@ -1,7 +1,7 @@ import json import re -file_path = 'word_dictionary.json' +file_path = 
'outputs/word_dictionary.json' # Open and load the JSON file with open(file_path, 'r', encoding='utf-8') as file: data = json.load(file) diff --git a/words-extractor.py b/words-extractor.py index 44b719e..109e6df 100644 --- a/words-extractor.py +++ b/words-extractor.py @@ -8,10 +8,9 @@ # Function to clean and split text into words def extract_words(text): - # Remove punctuation, convert to lowercase, and split into words - words = text.lower().split() + words_list = text.lower().split() # NOTE: additional processing will be added later - return words + return words_list # Initialize a defaultdict to store word counts word_dict = defaultdict(int) @@ -38,7 +37,7 @@ def extract_words(text): word_dict = dict(word_dict) # Save the word dictionary to a JSON file -output_file = 'word_dictionary.json' +output_file = 'outputs/word_dictionary.json' with open(output_file, 'w', encoding='utf-8') as json_file: json.dump(word_dict, json_file, ensure_ascii=False, indent=4)