From c0a2296b8259334166afaa083fa6310fffb1b670 Mon Sep 17 00:00:00 2001 From: har07 Date: Thu, 14 Jan 2016 20:39:07 +0700 Subject: [PATCH] refactor method and variable names to follow python convention: lower-case words separated by underscores --- src/Sastrawi/Dictionary/ArrayDictionary.py | 4 +- .../InvalidAffixPairSpecification.py | 6 +- src/Sastrawi/Stemmer/CachedStemmer.py | 4 +- .../PrecedenceAdjustmentSpecification.py | 6 +- src/Sastrawi/Stemmer/Context/Context.py | 166 ++++++++---------- src/Sastrawi/Stemmer/Context/Removal.py | 12 +- .../Stemmer/Context/RemovalInterface.py | 10 +- .../Visitor/AbstractDisambiguatePrefixRule.py | 18 +- .../Context/Visitor/DontStemShortWord.py | 4 +- .../Context/Visitor/PrefixDisambiguator.py | 2 +- .../Visitor/RemoveDerivationalSuffix.py | 12 +- .../Visitor/RemoveInflectionalParticle.py | 12 +- .../RemoveInflectionalPossessivePronoun.py | 12 +- .../Context/Visitor/RemovePlainPrefix.py | 12 +- .../Context/Visitor/VisitorProvider.py | 110 ++++++------ src/Sastrawi/Stemmer/Filter/TextNormalizer.py | 2 +- src/Sastrawi/Stemmer/Stemmer.py | 32 ++-- src/Sastrawi/Stemmer/StemmerFactory.py | 10 +- .../StopWordRemover/StopWordRemover.py | 2 +- .../StopWordRemover/StopWordRemoverFactory.py | 6 +- tests/FunctionalTests/Stemmer/StemmerTest.py | 8 +- tests/IntegrationTests/Stemmer/StemmerTest.py | 10 +- .../Dictionary/ArrayDictionaryTest.py | 12 +- .../InvalidAffixPairSpecificationTest.py | 18 +- tests/UnitTests/Stemmer/StemmerFactoryTest.py | 6 +- tests/UnitTests/Stemmer/StemmerTest.py | 4 +- .../StopWordRemoverFactoryTest.py | 2 +- .../StopWordRemover/StopWordRemoverTest.py | 2 +- 28 files changed, 240 insertions(+), 264 deletions(-) diff --git a/src/Sastrawi/Dictionary/ArrayDictionary.py b/src/Sastrawi/Dictionary/ArrayDictionary.py index 43e46f7..5819a29 100644 --- a/src/Sastrawi/Dictionary/ArrayDictionary.py +++ b/src/Sastrawi/Dictionary/ArrayDictionary.py @@ -4,7 +4,7 @@ class ArrayDictionary(object): def __init__(self, words=None): self.words = [] if words: - self.addWords(words) + self.add_words(words) def contains(self, word): return word in self.words @@ -12,7 +12,7 @@ def contains(self, word): def count(self): return len(self.words) - def addWords(self, words): + def add_words(self, words): """Add multiple words to the dictionary""" for word in words: self.add(word) diff --git a/src/Sastrawi/Morphology/InvalidAffixPairSpecification.py b/src/Sastrawi/Morphology/InvalidAffixPairSpecification.py index 9c7b8c9..578fa1f 100644 --- a/src/Sastrawi/Morphology/InvalidAffixPairSpecification.py +++ b/src/Sastrawi/Morphology/InvalidAffixPairSpecification.py @@ -5,14 +5,14 @@ class InvalidAffixPairSpecification(object): @link http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf """ - def isSatisfiedBy(self, word): + def is_satisfied_by(self, word): if re.match(r'^me(.*)kan$', word): return False if word == 'ketahui': return False - invalidAffixes = [r'^ber(.*)i$', + invalid_affixes = [r'^ber(.*)i$', r'^di(.*)an$', r'^ke(.*)i$', r'^ke(.*)an$', @@ -22,7 +22,7 @@ def isSatisfiedBy(self, word): r'^per(.*)an$'] contains = False - for invalidAffix in invalidAffixes: + for invalidAffix in invalid_affixes: contains = contains or re.match(invalidAffix, word) return contains diff --git a/src/Sastrawi/Stemmer/CachedStemmer.py b/src/Sastrawi/Stemmer/CachedStemmer.py index ad9f030..97258bc 100644 --- a/src/Sastrawi/Stemmer/CachedStemmer.py +++ b/src/Sastrawi/Stemmer/CachedStemmer.py @@ -8,7 +8,7 @@ def __init__(self, cache, delegatedStemmer): self.delegatedStemmer = delegatedStemmer def stem(self, text): - normalizedText = TextNormalizer.normalizeText(text) + normalizedText = TextNormalizer.normalize_text(text) words = normalizedText.split(' ') stems = [] @@ -23,5 +23,5 @@ def stem(self, text): return ' '.join(stems) - def getCache(self): + def get_cache(self): return self.cache diff --git a/src/Sastrawi/Stemmer/ConfixStripping/PrecedenceAdjustmentSpecification.py b/src/Sastrawi/Stemmer/ConfixStripping/PrecedenceAdjustmentSpecification.py index 9b42266..6219e05 100644 --- a/src/Sastrawi/Stemmer/ConfixStripping/PrecedenceAdjustmentSpecification.py +++ b/src/Sastrawi/Stemmer/ConfixStripping/PrecedenceAdjustmentSpecification.py @@ -7,8 +7,8 @@ class PrecedenceAdjustmentSpecification(object): @link http://researchbank.rmit.edu.au/eserv/rmit:6312/Asian.pdf """ - def isSatisfiedBy(self, value): - regexRules = [ + def is_satisfied_by(self, value): + regex_rules = [ r'^be(.*)lah$', r'^be(.*)an$', r'^me(.*)i$', @@ -17,7 +17,7 @@ def isSatisfiedBy(self, value): r'^ter(.*)i$', ] - for rule in regexRules: + for rule in regex_rules: if re.match(rule, value): return True diff --git a/src/Sastrawi/Stemmer/Context/Context.py b/src/Sastrawi/Stemmer/Context/Context.py index 29abb41..18223ee 100644 --- a/src/Sastrawi/Stemmer/Context/Context.py +++ b/src/Sastrawi/Stemmer/Context/Context.py @@ -4,185 +4,161 @@ class Context(object): """Stemming Context using Nazief and Adriani, CS, ECS, Improved ECS""" - def __init__(self, originalWord, dictionary, visitorProvider): - self.originalWord = originalWord - self.currentWord = originalWord + def __init__(self, original_word, dictionary, visitor_provider): + self.original_word = original_word + self.current_word = original_word self.dictionary = dictionary - self.visitorProvider = visitorProvider + self.visitor_provider = visitor_provider - self.processIsStopped = False + self.process_is_stopped = False self.removals = [] self.visitors = [] - self.suffixVisitors = [] - self.prefixVisitors = [] + self.suffix_visitors = [] + self.prefix_pisitors = [] self.result = '' - self.initVisitors() + self.init_visitors() - def initVisitors(self): - self.visitors = self.visitorProvider.getVisitors() - self.suffixVisitors = self.visitorProvider.getSuffixVisitors() - self.prefixVisitors = self.visitorProvider.getPrefixVisitors() - - def setDictionary(self, dictionary): - self.dictionary = dictionary - - def getDictionary(self): - return self.dictionary - - def getOriginalWord(self): - return self.originalWord - - def setCurrentWord(self, word): - self.currentWord = word - - def getCurrentWord(self): - return self.currentWord + def init_visitors(self): + self.visitors = self.visitor_provider.get_visitors() + self.suffix_visitors = self.visitor_provider.get_suffix_visitors() + self.prefix_pisitors = self.visitor_provider.get_prefix_visitors() def stopProcess(self): - self.processIsStopped = True - - #def processIsStopped(self): - # return self.processIsStopped + self.process_is_stopped = True - def addRemoval(self, removal): + def add_removal(self, removal): self.removals.append(removal) - def getRemovals(self): - return self.removals - - def getResult(self): - return self.result - def execute(self): - """Execute stemming process; the result can be retrieved with getResult()""" + """Execute stemming process; the result can be retrieved with result""" #step 1 - 5 - self.startStemmingProcess() + self.start_stemming_process() #step 6 - if self.dictionary.contains(self.currentWord): - self.result = self.getCurrentWord() + if self.dictionary.contains(self.current_word): + self.result = self.current_word else: - self.result = self.originalWord + self.result = self.original_word - def startStemmingProcess(self): + def start_stemming_process(self): #step 1 - if self.dictionary.contains(self.currentWord): + if self.dictionary.contains(self.current_word): return - self.acceptVisitors(self.visitors) - if self.dictionary.contains(self.currentWord): + self.accept_visitors(self.visitors) + if self.dictionary.contains(self.current_word): return csPrecedenceAdjustmentSpecification = PrecedenceAdjustmentSpecification() #Confix Stripping #Try to remove prefix before suffix if the specification is met - if csPrecedenceAdjustmentSpecification.isSatisfiedBy(self.getOriginalWord()): + if csPrecedenceAdjustmentSpecification.is_satisfied_by(self.original_word): #step 4, 5 - self.removePrefixes() - if self.dictionary.contains(self.currentWord): + self.remove_prefixes() + if self.dictionary.contains(self.current_word): return #step 2, 3 - self.removeSuffixes() - if self.dictionary.contains(self.currentWord): + self.remove_suffixes() + if self.dictionary.contains(self.current_word): return else: #if the trial is failed, restore the original word #and continue to normal rule precedence (suffix first, prefix afterwards) - self.setCurrentWord(self.originalWord) + self.current_word = self.original_word self.removals = [] #step 2, 3 - self.removeSuffixes() - if self.dictionary.contains(self.currentWord): + self.remove_suffixes() + if self.dictionary.contains(self.current_word): return #step 4, 5 - self.removePrefixes() - if self.dictionary.contains(self.currentWord): + self.remove_prefixes() + if self.dictionary.contains(self.current_word): return #ECS loop pengembalian akhiran - self.loopPengembalianAkhiran() + self.loop_pengembalian_akhiran() - def removePrefixes(self): + def remove_prefixes(self): for i in range(3): - self.acceptPrefixVisitors(self.prefixVisitors) - if self.dictionary.contains(self.currentWord): + self.accept_prefix_visitors(self.prefix_pisitors) + if self.dictionary.contains(self.current_word): return - def removeSuffixes(self): - self.acceptVisitors(self.suffixVisitors) + def remove_suffixes(self): + self.accept_visitors(self.suffix_visitors) def accept(self, visitor): visitor.visit(self) - def acceptVisitors(self, visitors): + def accept_visitors(self, visitors): for visitor in visitors: self.accept(visitor) - if self.dictionary.contains(self.currentWord): - return self.getCurrentWord() - if self.processIsStopped: - return self.getCurrentWord() + if self.dictionary.contains(self.current_word): + return self.current_word + if self.process_is_stopped: + return self.current_word - def acceptPrefixVisitors(self, visitors): + def accept_prefix_visitors(self, visitors): removalCount = len(self.removals) for visitor in visitors: self.accept(visitor) - if self.dictionary.contains(self.currentWord): - return self.getCurrentWord() - if self.processIsStopped: - return self.getCurrentWord() + if self.dictionary.contains(self.current_word): + return self.current_word + if self.process_is_stopped: + return self.current_word if len(self.removals) > removalCount: return - def loopPengembalianAkhiran(self): + def loop_pengembalian_akhiran(self): """ECS Loop Pengembalian Akhiran""" - self.restorePrefix() + self.restore_prefix() removals = self.removals - reversedRemovals = reversed(removals) - currentWord = self.getCurrentWord() + reversed_removals = reversed(removals) + current_word = self.current_word - for removal in reversedRemovals: - if not self.isSuffixRemoval(removal): + for removal in reversed_removals: + if not self.is_suffix_removal(removal): continue - if removal.getRemovedPart() == 'kan': - self.setCurrentWord(removal.getResult() + 'k') + if removal.get_removed_part() == 'kan': + self.current_word = removal.result + 'k' #step 4,5 - self.removePrefixes() - if self.dictionary.contains(self.currentWord): + self.remove_prefixes() + if self.dictionary.contains(self.current_word): return - self.setCurrentWord(removal.getResult() + 'kan') + self.current_word = removal.result + 'kan' else: - self.setCurrentWord(removal.getSubject()) + self.current_word = removal.get_subject() #step 4,5 - self.removePrefixes() - if self.dictionary.contains(self.currentWord): + self.remove_prefixes() + if self.dictionary.contains(self.current_word): return self.removals = removals - self.setCurrentWord(currentWord) + self.current_word = current_word - def isSuffixRemoval(self, removal): + def is_suffix_removal(self, removal): """Check wether the removed part is a suffix""" - return removal.getAffixType() == 'DS' \ - or removal.getAffixType() == 'PP' \ - or removal.getAffixType() == 'P' + return removal.get_affix_type() == 'DS' \ + or removal.get_affix_type() == 'PP' \ + or removal.get_affix_type() == 'P' - def restorePrefix(self): + def restore_prefix(self): """Restore prefix to proceed with ECS loop pengembalian akhiran""" for removal in self.removals: #return the word before precoding (the subject of first prefix removal) - self.setCurrentWord(removal.getSubject()) + self.current_word = removal.get_subject() break for removal in self.removals: - if removal.getAffixType() == 'DP': + if removal.get_affix_type() == 'DP': self.removals.remove(removal) diff --git a/src/Sastrawi/Stemmer/Context/Removal.py b/src/Sastrawi/Stemmer/Context/Removal.py index 0cb3d49..0cdfed8 100644 --- a/src/Sastrawi/Stemmer/Context/Removal.py +++ b/src/Sastrawi/Stemmer/Context/Removal.py @@ -1,6 +1,6 @@ from Sastrawi.Stemmer.Context.RemovalInterface import RemovalInterface -class Removal(RemovalInterface): +class Removal(object): """description of class""" def __init__(self, visitor, subject, result, removedPart, affixType): @@ -10,19 +10,19 @@ def __init__(self, visitor, subject, result, removedPart, affixType): self.removedPart = removedPart self.affixType = affixType - def getVisitor(self): + def get_visitor(self): return self.visitor - def getSubject(self): + def get_subject(self): return self.subject - def getResult(self): + def get_result(self): return self.result - def getRemovedPart(self): + def get_removed_part(self): return self.removedPart - def getAffixType(self): + def get_affix_type(self): return self.affixType diff --git a/src/Sastrawi/Stemmer/Context/RemovalInterface.py b/src/Sastrawi/Stemmer/Context/RemovalInterface.py index 0c332b1..93b6171 100644 --- a/src/Sastrawi/Stemmer/Context/RemovalInterface.py +++ b/src/Sastrawi/Stemmer/Context/RemovalInterface.py @@ -1,19 +1,19 @@ class RemovalInterface(object): """description of class""" - def getVisitor(self): + def get_visitor(self): pass - def getSubject(self): + def get_subject(self): pass - def getResult(self): + def get_result(self): pass - def getRemovedPart(self): + def get_removed_part(self): pass - def getAffixType(self): + def get_affix_type(self): pass diff --git a/src/Sastrawi/Stemmer/Context/Visitor/AbstractDisambiguatePrefixRule.py b/src/Sastrawi/Stemmer/Context/Visitor/AbstractDisambiguatePrefixRule.py index 222f0ff..2cd19fe 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/AbstractDisambiguatePrefixRule.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/AbstractDisambiguatePrefixRule.py @@ -11,25 +11,25 @@ def visit(self, context): result = None for disambiguator in self.disambiguators: - result = disambiguator.disambiguate(context.getCurrentWord()) - if context.getDictionary().contains(result): + result = disambiguator.disambiguate(context.current_word) + if context.dictionary.contains(result): break if not result: return - removedPart = re.sub(result, '', context.getCurrentWord(), 1) + removedPart = re.sub(result, '', context.current_word, 1) - removal = Removal(self, context.getCurrentWord(), result, removedPart, 'DP') + removal = Removal(self, context.current_word, result, removedPart, 'DP') - context.addRemoval(removal) - context.setCurrentWord(result) + context.add_removal(removal) + context.current_word = result - def addDisambiguators(self, disambiguators): + def add_disambiguators(self, disambiguators): for disambiguator in disambiguators: - self.addDisambiguator(disambiguator) + self.add_disambiguator(disambiguator) - def addDisambiguator(self, disambiguator): + def add_disambiguator(self, disambiguator): self.disambiguators.append(disambiguator) diff --git a/src/Sastrawi/Stemmer/Context/Visitor/DontStemShortWord.py b/src/Sastrawi/Stemmer/Context/Visitor/DontStemShortWord.py index deab39f..f002d26 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/DontStemShortWord.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/DontStemShortWord.py @@ -2,10 +2,10 @@ class DontStemShortWord(object): """description of class""" def visit(self, context): - if self.isShortWord(context.getCurrentWord()): + if self.is_whort_word(context.current_word): context.stopProcess() - def isShortWord(self, word): + def is_whort_word(self, word): return len(word) <= 3 diff --git a/src/Sastrawi/Stemmer/Context/Visitor/PrefixDisambiguator.py b/src/Sastrawi/Stemmer/Context/Visitor/PrefixDisambiguator.py index 33ea833..1d24185 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/PrefixDisambiguator.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/PrefixDisambiguator.py @@ -6,7 +6,7 @@ class PrefixDisambiguator(AbstractDisambiguatePrefixRule): def __init__(self, disambiguators): super(PrefixDisambiguator, self).__init__() - self.addDisambiguators(disambiguators) + self.add_disambiguators(disambiguators) diff --git a/src/Sastrawi/Stemmer/Context/Visitor/RemoveDerivationalSuffix.py b/src/Sastrawi/Stemmer/Context/Visitor/RemoveDerivationalSuffix.py index b61c498..889c5a5 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/RemoveDerivationalSuffix.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/RemoveDerivationalSuffix.py @@ -9,14 +9,14 @@ class RemoveDerivationalSuffix(object): """ def visit(self, context): - result = self.remove(context.getCurrentWord()) - if result != context.getCurrentWord(): - removedPart = re.sub(result, '', context.getCurrentWord(), 1) + result = self.remove(context.current_word) + if result != context.current_word: + removedPart = re.sub(result, '', context.current_word, 1) - removal = Removal(self, context.getCurrentWord(), result, removedPart, 'DS') + removal = Removal(self, context.current_word, result, removedPart, 'DS') - context.addRemoval(removal) - context.setCurrentWord(result) + context.add_removal(removal) + context.current_word = result def remove(self, word): """Remove derivational suffix diff --git a/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalParticle.py b/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalParticle.py index d37b05e..cb2df54 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalParticle.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalParticle.py @@ -9,14 +9,14 @@ class RemoveInflectionalParticle(object): """ def visit(self, context): - result = self.remove(context.getCurrentWord()) - if result != context.getCurrentWord(): - removedPart = re.sub(result, '', context.getCurrentWord(), 1) + result = self.remove(context.current_word) + if result != context.current_word: + removedPart = re.sub(result, '', context.current_word, 1) - removal = Removal(self, context.getCurrentWord(), result, removedPart, 'P') + removal = Removal(self, context.current_word, result, removedPart, 'P') - context.addRemoval(removal) - context.setCurrentWord(result) + context.add_removal(removal) + context.current_word = result def remove(self, word): """Remove inflectional particle : lah|kah|tah|pun""" diff --git a/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalPossessivePronoun.py b/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalPossessivePronoun.py index bcdc21f..69aafe0 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalPossessivePronoun.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/RemoveInflectionalPossessivePronoun.py @@ -9,14 +9,14 @@ class RemoveInflectionalPossessivePronoun(object): """ def visit(self, context): - result = self.remove(context.getCurrentWord()) - if result != context.getCurrentWord(): - removedPart = re.sub(result, '', context.getCurrentWord(), 1) + result = self.remove(context.current_word) + if result != context.current_word: + removedPart = re.sub(result, '', context.current_word, 1) - removal = Removal(self, context.getCurrentWord(), result, removedPart, 'PP') + removal = Removal(self, context.current_word, result, removedPart, 'PP') - context.addRemoval(removal) - context.setCurrentWord(result) + context.add_removal(removal) + context.current_word = result def remove(self, word): """Remove inflectional possessive pronoun : ku|mu|nya|-ku|-mu|-nya""" diff --git a/src/Sastrawi/Stemmer/Context/Visitor/RemovePlainPrefix.py b/src/Sastrawi/Stemmer/Context/Visitor/RemovePlainPrefix.py index 00c5410..60df9ed 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/RemovePlainPrefix.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/RemovePlainPrefix.py @@ -9,14 +9,14 @@ class RemovePlainPrefix(object): """ def visit(self, context): - result = self.remove(context.getCurrentWord()) - if result != context.getCurrentWord(): - removedPart = re.sub(result, '', context.getCurrentWord(), 1) + result = self.remove(context.current_word) + if result != context.current_word: + removedPart = re.sub(result, '', context.current_word, 1) - removal = Removal(self, context.getCurrentWord(), result, removedPart, 'DP') + removal = Removal(self, context.current_word, result, removedPart, 'DP') - context.addRemoval(removal) - context.setCurrentWord(result) + context.add_removal(removal) + context.current_word = result def remove(self, word): """Remove plain prefix : di|ke|se""" diff --git a/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py b/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py index 1be0b36..0ce6b33 100644 --- a/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py +++ b/src/Sastrawi/Stemmer/Context/Visitor/VisitorProvider.py @@ -51,86 +51,86 @@ class VisitorProvider(object): def __init__(self): self.visitors = [] - self.suffixVisitors = [] - self.prefixVisitors = [] + self.suffix_visitors = [] + self.prefix_pisitors = [] - self.initVisitors() + self.init_visitors() - def initVisitors(self): + def init_visitors(self): self.visitors.append(DontStemShortWord()) #{lah|kah|tah|pun} - self.suffixVisitors.append(RemoveInflectionalParticle()) + self.suffix_visitors.append(RemoveInflectionalParticle()) #{ku|mu|nya} - self.suffixVisitors.append(RemoveInflectionalPossessivePronoun()) + self.suffix_visitors.append(RemoveInflectionalPossessivePronoun()) #{i|kan|an} - self.suffixVisitors.append(RemoveDerivationalSuffix()) + self.suffix_visitors.append(RemoveDerivationalSuffix()) #{di|ke|se} - self.prefixVisitors.append(RemovePlainPrefix()) - - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule1a(), DisambiguatorPrefixRule1b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule2()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule3()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule4()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule5()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule6a(), DisambiguatorPrefixRule6b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule7()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule8()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule9()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule10()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule11()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule12()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule13a(), DisambiguatorPrefixRule13b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule14()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule15a(), DisambiguatorPrefixRule15b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule16()])) + self.prefix_pisitors.append(RemovePlainPrefix()) + + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule1a(), DisambiguatorPrefixRule1b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule2()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule3()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule4()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule5()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule6a(), DisambiguatorPrefixRule6b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule7()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule8()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule9()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule10()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule11()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule12()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule13a(), DisambiguatorPrefixRule13b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule14()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule15a(), DisambiguatorPrefixRule15b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule16()])) disambiguators17 = [DisambiguatorPrefixRule17a(), DisambiguatorPrefixRule17b(), \ DisambiguatorPrefixRule17c(), DisambiguatorPrefixRule17d()] - self.prefixVisitors.append(PrefixDisambiguator(disambiguators17)) - - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule18a(), DisambiguatorPrefixRule18b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule19()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule20()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule21a(), DisambiguatorPrefixRule21b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule23()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule24()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule25()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule26a(), DisambiguatorPrefixRule26b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule27()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule28a(), DisambiguatorPrefixRule28b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule29()])) + self.prefix_pisitors.append(PrefixDisambiguator(disambiguators17)) + + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule18a(), DisambiguatorPrefixRule18b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule19()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule20()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule21a(), DisambiguatorPrefixRule21b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule23()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule24()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule25()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule26a(), DisambiguatorPrefixRule26b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule27()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule28a(), DisambiguatorPrefixRule28b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule29()])) disambiguators30 = [DisambiguatorPrefixRule30a(), DisambiguatorPrefixRule30b(), \ DisambiguatorPrefixRule30c()] - self.prefixVisitors.append(PrefixDisambiguator(disambiguators30)) + self.prefix_pisitors.append(PrefixDisambiguator(disambiguators30)) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule31a(), DisambiguatorPrefixRule31b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule32()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule34()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule31a(), DisambiguatorPrefixRule31b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule32()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule34()])) #CS additional rules - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule35()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule36()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule35()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule36()])) #CS infix rules - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule37a(), DisambiguatorPrefixRule37b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule38a(), DisambiguatorPrefixRule38b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule39a(), DisambiguatorPrefixRule39b()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule40a(), DisambiguatorPrefixRule40b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule37a(), DisambiguatorPrefixRule37b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule38a(), DisambiguatorPrefixRule38b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule39a(), DisambiguatorPrefixRule39b()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule40a(), DisambiguatorPrefixRule40b()])) #Sastrawi rules #ku-A, kau-A - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule41()])) - self.prefixVisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule42()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule41()])) + self.prefix_pisitors.append(PrefixDisambiguator([DisambiguatorPrefixRule42()])) - def getVisitors(self): + def get_visitors(self): return self.visitors - def getSuffixVisitors(self): - return self.suffixVisitors + def get_suffix_visitors(self): + return self.suffix_visitors - def getPrefixVisitors(self): - return self.prefixVisitors + def get_prefix_visitors(self): + return self.prefix_pisitors diff --git a/src/Sastrawi/Stemmer/Filter/TextNormalizer.py b/src/Sastrawi/Stemmer/Filter/TextNormalizer.py index b0755d1..2d72e27 100644 --- a/src/Sastrawi/Stemmer/Filter/TextNormalizer.py +++ b/src/Sastrawi/Stemmer/Filter/TextNormalizer.py @@ -1,6 +1,6 @@ import re -def normalizeText(text): +def normalize_text(text): result = str.lower(text) result = re.sub(r'[^a-z0-9 -]', ' ', result, flags = re.IGNORECASE|re.MULTILINE) result = re.sub(r'( +)', ' ', result, flags = re.IGNORECASE|re.MULTILINE) diff --git a/src/Sastrawi/Stemmer/Stemmer.py b/src/Sastrawi/Stemmer/Stemmer.py index 41f1cd1..cb196c7 100644 --- a/src/Sastrawi/Stemmer/Stemmer.py +++ b/src/Sastrawi/Stemmer/Stemmer.py @@ -11,31 +11,31 @@ class Stemmer(object): """ def __init__(self, dictionary): self.dictionary = dictionary - self.visitorProvider = VisitorProvider() + self.visitor_provider = VisitorProvider() - def getDictionary(self): + def get_dictionary(self): return self.dictionary def stem(self, text): """Stem a text string to its common stem form.""" - normalizedText = TextNormalizer.normalizeText(text) + normalizedText = TextNormalizer.normalize_text(text) words = normalizedText.split(' ') stems = [] for word in words: - stems.append(self.stemWord(word)) + stems.append(self.stem_word(word)) return ' '.join(stems) - def stemWord(self, word): + def stem_word(self, word): """Stem a word to its common stem form.""" - if self.isPlural(word): - return self.stemPluralWord(word) + if self.is_plural(word): + return self.stem_plural_word(word) else: - return self.stemSingularWord(word) + return self.stem_singular_word(word) - def isPlural(self, word): + def is_plural(self, word): #-ku|-mu|-nya #nikmat-Ku, etc matches = re.match(r'^(.*)-(ku|mu|nya|lah|kah|tah|pun)$', word) @@ -44,7 +44,7 @@ def isPlural(self, word): return word.find('-') != -1 - def stemPluralWord(self, plural): + def stem_plural_word(self, plural): """Stem a plural word to its common stem form. Asian J. (2007) "Effective Techniques for Indonesian Text Retrieval" page 76-77. @@ -66,22 +66,22 @@ def stemPluralWord(self, plural): words[1] = matches.group(2) + '-' + suffix #berbalas-balasan -> balas - rootWord1 = self.stemSingularWord(words[0]) - rootWord2 = self.stemSingularWord(words[1]) + rootWord1 = self.stem_singular_word(words[0]) + rootWord2 = self.stem_singular_word(words[1]) #meniru-nirukan -> tiru if not self.dictionary.contains(words[1]) and rootWord2 == words[1]: - rootWord2 = self.stemSingularWord('me' + words[1]) + rootWord2 = self.stem_singular_word('me' + words[1]) if rootWord1 == rootWord2: return rootWord1 else: return plural - def stemSingularWord(self, word): + def stem_singular_word(self, word): """Stem a singular word to its common stem form.""" - context = Context(word, self.dictionary, self.visitorProvider) + context = Context(word, self.dictionary, self.visitor_provider) context.execute() - return context.getResult() + return context.result diff --git a/src/Sastrawi/Stemmer/StemmerFactory.py b/src/Sastrawi/Stemmer/StemmerFactory.py index 6bf2780..f01f80f 100644 --- a/src/Sastrawi/Stemmer/StemmerFactory.py +++ b/src/Sastrawi/Stemmer/StemmerFactory.py @@ -8,10 +8,10 @@ class StemmerFactory(object): """ Stemmer factory helps creating pre-configured stemmer """ APC_KEY = 'sastrawi_cache_dictionary' - def createStemmer(self, isDev=False): + def create_stemmer(self, isDev=False): """ Returns Stemmer instance """ - words = self.getWords(isDev) + words = self.get_words(isDev) dictionary = ArrayDictionary(words) stemmer = Stemmer(dictionary) @@ -20,7 +20,7 @@ def createStemmer(self, isDev=False): return cachedStemmer - def getWords(self, isDev=False): + def get_words(self, isDev=False): #if isDev or callable(getattr(self, 'apc_fetch')): # words = self.getWordsFromFile() #else: @@ -28,9 +28,9 @@ def getWords(self, isDev=False): # if not words: # words = self.getWordsFromFile() # apc_store(self.APC_KEY, words) - return self.getWordsFromFile() + return self.get_words_from_file() - def getWordsFromFile(self): + def get_words_from_file(self): current_dir = os.path.dirname(os.path.realpath(__file__)) dictionaryFile = current_dir + '/../../../data/kata-dasar.txt' if not os.path.isfile(dictionaryFile): diff --git a/src/Sastrawi/StopWordRemover/StopWordRemover.py b/src/Sastrawi/StopWordRemover/StopWordRemover.py index d4c834e..b3b2f25 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemover.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemover.py @@ -4,7 +4,7 @@ class StopWordRemover(object): def __init__(self, dictionary): self.dictionary = dictionary - def getDictionary(self): + def get_dictionary(self): return self.dictionary def remove(self, text): diff --git a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py index 903c490..5b35049 100644 --- a/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py +++ b/src/Sastrawi/StopWordRemover/StopWordRemoverFactory.py @@ -4,14 +4,14 @@ class StopWordRemoverFactory(object): """description of class""" - def createStopWordRemover(self): - stopWords = self.getStopWords() + def create_stop_word_remover(self): + stopWords = self.get_stop_words() dictionary = ArrayDictionary(stopWords) stopWordRemover = StopWordRemover(dictionary) return stopWordRemover - def getStopWords(self): + def get_stop_words(self): return ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 'dia', 'dua', 'ia', 'seperti', 'jika', 'jika', 'sehingga', 'kembali', 'dan', 'tidak', 'ini', 'karena', 'kepada', 'oleh', 'saat', 'harus', 'sementara', 'setelah', 'belum', 'kami', 'sekitar', diff --git a/tests/FunctionalTests/Stemmer/StemmerTest.py b/tests/FunctionalTests/Stemmer/StemmerTest.py index 8852959..cc21eb3 100644 --- a/tests/FunctionalTests/Stemmer/StemmerTest.py +++ b/tests/FunctionalTests/Stemmer/StemmerTest.py @@ -29,10 +29,10 @@ def setUp(self): self.stemmer = Stemmer(self.dictionary) return super(Test_StemmerTest, self).setUp() - def tryStem(self, word, stem): + def try_stem(self, word, stem): self.assertEquals(stem, self.stemmer.stem(word)) - def getTestData(self): + def get_test_data(self): data = [] # don't stem short words @@ -367,9 +367,9 @@ def getTestData(self): return data def test_All(self): - data = self.getTestData() + data = self.get_test_data() for d in data: - self.tryStem(d[0], d[1]) + self.try_stem(d[0], d[1]) if __name__ == '__main__': diff --git a/tests/IntegrationTests/Stemmer/StemmerTest.py b/tests/IntegrationTests/Stemmer/StemmerTest.py index 119c694..a5cc325 100644 --- a/tests/IntegrationTests/Stemmer/StemmerTest.py +++ b/tests/IntegrationTests/Stemmer/StemmerTest.py @@ -6,10 +6,10 @@ class Test_StemmerTest(unittest.TestCase): def setUp(self): stemmerFactory = StemmerFactory() - self.stemmer = stemmerFactory.createStemmer() + self.stemmer = stemmerFactory.create_stemmer() return super(Test_StemmerTest, self).setUp() - def getTestData(self): + def get_test_data(self): data = [] data.append(['kebijakan', 'bijak']) @@ -52,13 +52,13 @@ def getTestData(self): return data - def tryStem(self, word, stem): + def try_stem(self, word, stem): self.assertEquals(stem, self.stemmer.stem(word)) def test_stem(self): - data = self.getTestData() + data = self.get_test_data() for d in data: - self.tryStem(d[0], d[1]) + self.try_stem(d[0], d[1]) if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Dictionary/ArrayDictionaryTest.py b/tests/UnitTests/Dictionary/ArrayDictionaryTest.py index cff52d9..57d72fc 100644 --- a/tests/UnitTests/Dictionary/ArrayDictionaryTest.py +++ b/tests/UnitTests/Dictionary/ArrayDictionaryTest.py @@ -7,29 +7,29 @@ def setUp(self): self.dictionary = ArrayDictionary() return super(Test_ArrayDictionaryTest, self).setUp() - def test_addAndContain(self): + def test_add_and_contain(self): self.assertFalse(self.dictionary.contains('word')) self.dictionary.add('word') self.assertTrue(self.dictionary.contains('word')) - def test_addCountWord(self): + def test_add_count_word(self): self.assertEquals(0, self.dictionary.count()) self.dictionary.add('word') self.assertEquals(1, self.dictionary.count()) - def test_addWordIgnoreEmptyString(self): + def test_add_word_ignore_empty_string(self): self.assertEquals(0, self.dictionary.count()) self.dictionary.add('') self.assertEquals(0, self.dictionary.count()) - def test_addWords(self): + def test_add_words(self): words = ['word1', 'word2'] - self.dictionary.addWords(words) + self.dictionary.add_words(words) self.assertEquals(2, self.dictionary.count()) self.assertTrue(self.dictionary.contains('word1')) self.assertTrue(self.dictionary.contains('word2')) - def test_constructorPreserveWords(self): + def test_constructor_preserve_words(self): words = ['word1', 'word2'] dictionary = ArrayDictionary(words) self.assertEquals(2, dictionary.count()) diff --git a/tests/UnitTests/Morphology/InvalidAffixPairSpecificationTest.py b/tests/UnitTests/Morphology/InvalidAffixPairSpecificationTest.py index 769933d..14dddff 100644 --- a/tests/UnitTests/Morphology/InvalidAffixPairSpecificationTest.py +++ b/tests/UnitTests/Morphology/InvalidAffixPairSpecificationTest.py @@ -7,16 +7,16 @@ def setUp(self): return super(Test_InvalidAffixPairSpecificationTest, self).setUp() def test_containsInvalidAffixPair(self): - self.assertFalse(self.specification.isSatisfiedBy('memberikan')) - self.assertFalse(self.specification.isSatisfiedBy('ketahui')) + self.assertFalse(self.specification.is_satisfied_by('memberikan')) + self.assertFalse(self.specification.is_satisfied_by('ketahui')) - self.assertTrue(self.specification.isSatisfiedBy('berjatuhi')) - self.assertTrue(self.specification.isSatisfiedBy('dipukulan')) - self.assertTrue(self.specification.isSatisfiedBy('ketiduri')) - self.assertTrue(self.specification.isSatisfiedBy('ketidurkan')) - self.assertTrue(self.specification.isSatisfiedBy('menduaan')) - self.assertTrue(self.specification.isSatisfiedBy('terduaan')) - self.assertTrue(self.specification.isSatisfiedBy('perkataan')) + self.assertTrue(self.specification.is_satisfied_by('berjatuhi')) + self.assertTrue(self.specification.is_satisfied_by('dipukulan')) + self.assertTrue(self.specification.is_satisfied_by('ketiduri')) + self.assertTrue(self.specification.is_satisfied_by('ketidurkan')) + self.assertTrue(self.specification.is_satisfied_by('menduaan')) + self.assertTrue(self.specification.is_satisfied_by('terduaan')) + self.assertTrue(self.specification.is_satisfied_by('perkataan')) if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/StemmerFactoryTest.py b/tests/UnitTests/Stemmer/StemmerFactoryTest.py index 22879d5..146df28 100644 --- a/tests/UnitTests/Stemmer/StemmerFactoryTest.py +++ b/tests/UnitTests/Stemmer/StemmerFactoryTest.py @@ -8,13 +8,13 @@ def setUp(self): return super(Test_StemmerFactoryTest, self).setUp() def test_createStemmerReturnStemmer(self): - stemmer = self.factory.createStemmer() + stemmer = self.factory.create_stemmer() self.assertIsNotNone(stemmer) #self.assertIsInstance(stemmer, Stemmer) def test_fungsional(self): factory = StemmerFactory() - stemmer = factory.createStemmer() + stemmer = factory.create_stemmer() sentence = 'malaikat-malaikat-Nya' expected = 'malaikat' @@ -25,7 +25,7 @@ def test_fungsional(self): def test_getWordsFromFile(self): factory = StemmerFactory() - factory.getWordsFromFile() + factory.get_words_from_file() if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/Stemmer/StemmerTest.py b/tests/UnitTests/Stemmer/StemmerTest.py index 6e0fda6..77d152c 100644 --- a/tests/UnitTests/Stemmer/StemmerTest.py +++ b/tests/UnitTests/Stemmer/StemmerTest.py @@ -19,9 +19,9 @@ def test_StemReturnImmediatelyIfFoundOnDictionary(self): """To prevent overstemming : nilai could have been overstemmed to nila if we don't lookup against the dictionary """ - self.stemmer.getDictionary().add('nila') + self.stemmer.get_dictionary().add('nila') self.assertEquals('nila', self.stemmer.stem('nilai')) - self.stemmer.getDictionary().add('nilai') + self.stemmer.get_dictionary().add('nilai') self.assertEquals('nilai', self.stemmer.stem('nilai')) if __name__ == '__main__': diff --git a/tests/UnitTests/StopWordRemover/StopWordRemoverFactoryTest.py b/tests/UnitTests/StopWordRemover/StopWordRemoverFactoryTest.py index 294c749..5f758f8 100644 --- a/tests/UnitTests/StopWordRemover/StopWordRemoverFactoryTest.py +++ b/tests/UnitTests/StopWordRemover/StopWordRemoverFactoryTest.py @@ -8,7 +8,7 @@ def setUp(self): return super(Test_StopWordRemoverFactoryTest, self).setUp() def test_createStopWordRemover(self): - self.assertIsInstance(self.factory.createStopWordRemover(), StopWordRemover) + self.assertIsInstance(self.factory.create_stop_word_remover(), StopWordRemover) if __name__ == '__main__': unittest.main() diff --git a/tests/UnitTests/StopWordRemover/StopWordRemoverTest.py b/tests/UnitTests/StopWordRemover/StopWordRemoverTest.py index e449557..ff36116 100644 --- a/tests/UnitTests/StopWordRemover/StopWordRemoverTest.py +++ b/tests/UnitTests/StopWordRemover/StopWordRemoverTest.py @@ -9,7 +9,7 @@ def setUp(self): return super(Test_StopWordRemoverTest, self).setUp() def test_getDictionaryPreserveInstance(self): - self.assertEqual(self.dictionary, self.stopWordRemover.getDictionary()) + self.assertEqual(self.dictionary, self.stopWordRemover.get_dictionary()) def test_removeStopWord(self): self.assertEquals('pergi sekolah', self.stopWordRemover.remove('pergi ke sekolah'))