-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathLatvianStemmer.py
168 lines (146 loc) · 4.44 KB
/
LatvianStemmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
__author__ = 'rihards'
import fileinput
'''
Original Java code can be found in /~https://github.com/apache/lucene-solr
Ported to Python by Rihards Krišlauks with minor modifications
* Light stemmer for Latvian.
* <p>
* This is a light version of the algorithm in Karlis Kreslin's PhD thesis
* <i>A stemming algorithm for Latvian</i> with the following modifications:
* <ul>
* <li>Only explicitly stems noun and adjective morphology
* <li>Stricter length/vowel checks for the resulting stems (verb etc suffix stripping is removed)
* <li>Removes only the primary inflectional suffixes: case and number for nouns
* case, number, gender, and definitiveness for adjectives.
* <li>Palatalization is only handled when a declension II,V,VI noun suffix is removed.
* </ul>
'''
class Affix:
def __init__(self, affix, vc, palatalizes):
self.affix = affix
self.vc = vc
self.palatalizes = palatalizes
'''
Stem a latvian word. returns the new adjusted length.
'''
affixes = [
Affix("ajiem", 3, False),
Affix("ajai", 3, False),
Affix("ajam", 2, False),
Affix("ajām", 2, False),
Affix("ajos", 2, False),
Affix("ajās", 2, False),
Affix("iem", 2, True),
Affix("ajā", 2, False),
Affix("ais", 2, False),
Affix("ai", 2, False),
Affix("ei", 2, False),
Affix("ām", 1, False),
Affix("am", 1, False),
Affix("ēm", 1, False),
Affix("īm", 1, False),
Affix("im", 1, False),
Affix("um", 1, False),
Affix("us", 1, True),
Affix("as", 1, False),
Affix("ās", 1, False),
Affix("es", 1, False),
Affix("os", 1, True),
Affix("ij", 1, False),
Affix("īs", 1, False),
Affix("ēs", 1, False),
Affix("is", 1, False),
Affix("ie", 1, False),
Affix("u", 1, True),
Affix("a", 1, True),
Affix("i", 1, True),
Affix("e", 1, False),
Affix("ā", 1, False),
Affix("ē", 1, False),
Affix("ī", 1, False),
Affix("ū", 1, False),
Affix("o", 1, False),
Affix("s", 0, False),
Affix("š", 0, False)
]
'''
* Most cases are handled except for the ambiguous ones:
* <ul>
* <li> s -> š
* <li> t -> š
* <li> d -> ž
* <li> z -> ž
* </ul>
*/
'''
def un_palatalize(s, length):
# we check the character removed: if its -u then
# its 2,5, or 6 gen pl., and these two can only apply then.
if s[length] == 'u':
# kš -> kst
if endswith(s, length, "kš"):
length += 1
s[length - 2] = 's'
s[length - 1] = 't'
return s[:length]
elif endswith(s, length, "ņņ"):
s[length - 2] = 'n'
s[length - 1] = 'n'
return s[:length]
# otherwise all other rules
if endswith(s, length, "pj")\
or endswith(s, length, "bj")\
or endswith(s, length, "mj")\
or endswith(s, length, "vj"):
# labial consonant
length = length - 1
elif endswith(s, length, "šņ"):
s[length - 2] = 's'
s[length - 1] = 'n'
elif endswith(s, length, "žņ"):
s[length - 2] = 'z'
s[length - 1] = 'n'
elif endswith(s, length, "šļ"):
s[length - 2] = 's'
s[length - 1] = 'l'
elif endswith(s, length, "žļ"):
s[length - 2] = 'z'
s[length - 1] = 'l'
elif endswith(s, length, "ļņ"):
s[length - 2] = 'l'
s[length - 1] = 'n'
elif endswith(s, length, "ļļ"):
s[length - 2] = 'l'
s[length - 1] = 'l'
elif s[length - 1] == 'č':
s[length - 1] = 'c'
elif s[length - 1] == 'ļ':
s[length - 1] = 'l'
elif s[length - 1] == 'ņ':
s[length - 1] = 'n'
return s[:length]
def endswith(s, length, suffix):
return "".join(s[:length]).endswith(suffix)
def num_vowels(s):
vowels = {}.fromkeys('aāeēiīouūAĀEĒIĪOUŪ')
count = 0
for char in s:
if char in vowels:
count += 1
return count
def stem(s):
s = list(s)
numvowels = num_vowels(s)
length = len(s)
for affix in affixes:
if numvowels > affix.vc and length >= len(affix.affix) + 3 and endswith(s, length, affix.affix):
length -= len(affix.affix)
s = un_palatalize(s, length) if affix.palatalizes else s[:length]
break
return ''.join(s)
def main():
for line in fileinput.input():
stems = map(stem, line.rstrip().split())
print(' '.join(stems))
if __name__ == "__main__":
main()