-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchinese_wc.py
executable file
·68 lines (50 loc) · 1.96 KB
/
chinese_wc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
#
import io
import os
import sys
# filename = "/home/wd/wordcount.log"
def count_file(filename):
    """Return the number of Chinese characters contained in a text file.

    The file is decoded as UTF-8 and every character whose code point lies in
    the basic CJK Unified Ideographs block (U+4E00 through U+9FFF, both ends
    inclusive) is counted. All other characters -- Latin letters, digits,
    punctuation, whitespace -- are ignored.

    Args:
        filename: Path of the text file to scan.

    Returns:
        The number of matching characters, as an int (0 for an empty file).

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # The context manager closes the handle even if decoding raises.
    with io.open(filename, 'r', encoding='utf8') as f:
        s = f.read()
    # Both bounds are inclusive on purpose: U+4E00 is itself an ideograph, so
    # a strict "<" comparison would silently drop it from the count.
    # See https://stackoverflow.com/a/11415841/1248554 for additional ranges
    # (CJK extension blocks) if they are ever needed.
    return sum(1 for ch in s if 0x4E00 <= ord(ch) <= 0x9FFF)
# https://stackoverflow.com/questions/16528005/find-the-length-of-a-sentence-with-english-words-and-chinese-characters
def walk(top, maxdepth):
    """Walk a directory tree top-down, stopping at a maximum depth.

    Yields one ``(directory, subdir_names, file_names)`` tuple per visited
    directory, starting with ``top`` itself. The names are bare entry names
    as returned by ``os.listdir``, not full paths.

    Args:
        top: Path of the directory to start from.
        maxdepth: Number of directory levels to visit. ``top`` is always
            listed; its sub-directories are entered only while the remaining
            depth is greater than 1.

    Yields:
        Tuples ``(path, dirs, nondirs)`` where ``dirs`` holds the entries for
        which ``os.path.isdir`` is true and ``nondirs`` holds everything else.
    """
    subdirs = []
    others = []
    for entry in os.listdir(top):
        if os.path.isdir(os.path.join(top, entry)):
            subdirs.append(entry)
        else:
            others.append(entry)

    yield top, subdirs, others

    # Depth budget exhausted: report this level but do not descend further.
    if maxdepth <= 1:
        return

    for entry in subdirs:
        yield from walk(os.path.join(top, entry), maxdepth - 1)
# count all files in specific directory
# def count_directory(directory):
# ''' count files in a directory'''
if __name__ == "__main__":
    # Command-line entry point. Expects exactly three arguments:
    #   1. the directory to scan,
    #   2. how many directory levels to visit (converted with int()),
    #   3. the filename suffix a file must end with to be counted.
    if len(sys.argv) == 4:
        root, depth, appendix = sys.argv[1:]
        print("the directory is: ", root)
        for folder, _subdirs, names in walk(root, int(depth)):
            for name in names:
                # Only files whose name ends with the requested suffix count.
                if not name.endswith(appendix):
                    continue
                path = folder + '/' + name
                # One line per file: "<count><TAB><path>".
                print("{count}\t{filename}".format(filename=path, count=count_file(path)))
    else:
        # Wrong number of arguments: show how the script is meant to be run.
        print("usage: python chinese_wc.py [dir] [depth] [appendix]\r\nlike: python chinese_wc.py ~/Documents/org 3 \'.org\'")