Source code for alex.utils.czech_stemmer

#!/usr/bin/env python
# -*- coding: utf-8 -*-

''' Czech stemmer
Copyright © 2010 Luís Gomes <luismsgomes@gmail.com>.

Ported from the Java implementation available at:
    http://members.unine.ch/jacques.savoy/clef/index.html

'''
import re
import sys

def cz_stem_word(word, aggressive=False):
    """Return the stem of a single Czech word.

    The word is returned unchanged when it is not made up solely of word
    characters, or when its casing is neither all-lowercase, Titlecase nor
    ALL-UPPERCASE.  Otherwise it is lowercased, case endings and possessive
    suffixes are removed, and the original casing style is re-applied to
    the resulting stem.

    :param word: the word to stem.
    :param aggressive: when true, additionally strip comparative,
        diminutive, augmentative and derivational suffixes (in that order).
    :return: the stemmed word, or ``word`` itself if it was rejected by one
        of the guards above.
    """
    # Without re.UNICODE, Python 2 treats \w as [a-zA-Z0-9_] only, so every
    # word containing Czech diacritics was rejected here and never stemmed.
    # Byte strings keep the old ASCII-only matching: the suffix tables are
    # unicode literals and cannot be compared against undecoded bytes.
    flags = 0 if isinstance(word, bytes) else re.UNICODE
    if not re.match(u"^\\w+$", word, flags):
        return word
    if not word.islower() and not word.istitle() and not word.isupper():
        return word
    s = word.lower()  # all our pattern matching is done in lowercase
    s = _remove_case(s)
    s = _remove_possessives(s)
    if aggressive:
        s = _remove_comparative(s)
        s = _remove_diminutive(s)
        s = _remove_augmentative(s)
        s = _remove_derivational(s)
    # Restore the casing style of the input.
    if word.isupper():
        return s.upper()
    if word.istitle():
        return s.title()
    return s

def cz_stem(l, aggressive=False):
    """Stem a single word or, recursively, an iterable of words.

    :param l: a string, or an iterable whose items are strings or further
        iterables.  Falsy items (empty strings, ``None``, empty lists) are
        dropped from the result.
    :param aggressive: forwarded to :func:`cz_stem_word` for every word.
    :return: the stemmed word for string input, otherwise a list of the
        stemmed items.
    """
    try:
        text_types = (str, unicode)
    except NameError:  # Python 3 has no separate ``unicode`` type
        text_types = (str,)
    if isinstance(l, text_types):
        # ``aggressive`` used to be dropped here, so callers asking for
        # aggressive stemming silently got light stemming.
        return cz_stem_word(l, aggressive=aggressive)
    return [cz_stem(w, aggressive=aggressive) for w in l if w]

def _remove_case(word):
    """Strip a grammatical case ending from the (lowercase) *word*.

    The rules are tried from top to bottom and the first one that applies
    wins.  A rule applies when the word is strictly longer than the rule's
    length threshold and ends in one of the rule's suffixes.  The matching
    rule cuts a fixed number of trailing characters and, where flagged,
    passes the remainder through ``_palatalise``.
    """
    # (word must be longer than, suffixes, characters to cut, palatalise?)
    rules = (
        (7, (u"atech",), 5, False),
        (6, (u"ětem",), 3, True),
        (6, (u"atům",), 4, False),
        (5, (u"ech", u"ich", u"ích", u"ého", u"ěmi", u"emi", u"ému",
             u"ete", u"eti", u"iho", u"ího", u"ími", u"imu"), 2, True),
        (5, (u"ách", u"ata", u"aty", u"ých", u"ama", u"ami",
             u"ové", u"ovi", u"ými"), 3, False),
        (4, (u"em",), 1, True),
        (4, (u"es", u"ém", u"ím"), 2, True),
        (4, (u"ům", u"at", u"ám", u"os", u"us", u"ým", u"mi", u"ou"),
         2, False),
        # Nothing is cut here: the whole word goes to _palatalise.
        (3, (u"e", u"i", u"í", u"ě"), 0, True),
        (3, (u"u", u"y", u"ů", u"a", u"o", u"á", u"é", u"ý"), 1, False),
    )
    size = len(word)
    for longer_than, suffixes, cut, soften in rules:
        if size <= longer_than:
            continue
        # All suffixes of one rule share the same length.
        if word[-len(suffixes[0]):] in suffixes:
            stem = word[:size - cut]
            return _palatalise(stem) if soften else stem
    return word

def _remove_possessives(word):
    """Strip a possessive suffix from *word*.

    Only words longer than five characters are touched.  ``ov`` / ``ův``
    are removed outright; for ``in`` only the final ``n`` is cut and the
    rest is handed to ``_palatalise``.
    """
    if len(word) <= 5:
        return word
    ending = word[-2:]
    if ending == u"ov" or ending == u"ův":
        return word[:-2]
    if ending == u"in":
        return _palatalise(word[:-1])
    return word

def _remove_comparative(word):
    """Strip the comparative suffix ``ejš`` / ``ějš`` from *word*.

    Applies only to words longer than five characters.  The last two
    characters are cut and the remainder goes through ``_palatalise``.
    Any other word is returned unchanged.
    """
    has_suffix = word[-3:] in (u"ejš", u"ějš")
    if has_suffix and len(word) > 5:
        return _palatalise(word[:-2])
    return word

def _remove_diminutive(word):
    """Strip a diminutive suffix from *word*.

    The rules are tried in order and the first applicable one wins.  A rule
    applies when the word is strictly longer than its threshold and ends in
    one of its suffixes.  It then cuts a fixed number of trailing
    characters and, where flagged, runs the rest through ``_palatalise``.
    """
    # (word must be longer than, suffixes, characters to cut, palatalise?)
    rules = (
        (7, (u"oušek",), 5, False),
        (6, (u"eček", u"éček", u"iček", u"íček", u"enek", u"ének",
             u"inek", u"ínek"), 3, True),
        (6, (u"áček", u"aček", u"oček", u"uček", u"anek", u"onek",
             u"unek", u"ánek"), 4, True),
        (5, (u"ečk", u"éčk", u"ičk", u"íčk", u"enk", u"énk",
             u"ink", u"ínk"), 3, True),
        (5, (u"áčk", u"ačk", u"očk", u"učk", u"ank", u"onk",
             u"unk", u"átk", u"ánk", u"ušk"), 3, False),
        (4, (u"ek", u"ék", u"ík", u"ik"), 1, True),
        (4, (u"ák", u"ak", u"ok", u"uk"), 1, False),
        (3, (u"k",), 1, False),
    )
    size = len(word)
    for longer_than, suffixes, cut, soften in rules:
        if size <= longer_than:
            continue
        # All suffixes of one rule share the same length.
        if word[-len(suffixes[0]):] in suffixes:
            stem = word[:size - cut]
            return _palatalise(stem) if soften else stem
    return word

def _remove_augmentative(word):
    """Strip an augmentative suffix from *word*.

    ``ajzn`` is removed from words longer than six characters and ``ák``
    from words longer than four.  For ``izn`` / ``isk`` (words longer than
    five) the last two characters are cut and the rest is handed to
    ``_palatalise``.
    """
    size = len(word)
    if size <= 4:
        # No rule applies to words this short.
        return word
    if size > 6 and word[-4:] == u"ajzn":
        return word[:-4]
    if size > 5 and word[-3:] in (u"izn", u"isk"):
        return _palatalise(word[:-2])
    if word[-2:] == u"ák":
        return word[:-2]
    return word

def _remove_derivational(word):
    """Strip a derivational suffix from *word*.

    The rules are tried in order and the first applicable one wins.  A rule
    applies when the word is strictly longer than its threshold and ends in
    one of its suffixes.  It then cuts a fixed number of trailing
    characters and, where flagged, runs the rest through ``_palatalise``.
    """
    # (word must be longer than, suffixes, characters to cut, palatalise?)
    rules = (
        (8, (u"obinec",), 6, False),
        (7, (u"ionář",), 4, True),
        (7, (u"ovisk", u"ovstv", u"ovišt", u"ovník"), 5, False),
        (6, (u"ásek", u"loun", u"nost", u"teln", u"ovec", u"ovík",
             u"ovtv", u"ovin", u"štin"), 4, False),
        (6, (u"enic", u"inec", u"itel"), 3, True),
        (5, (u"árn",), 3, False),
        (5, (u"ěnk", u"ián", u"ist", u"isk", u"išt", u"itb", u"írn"),
         2, True),
        (5, (u"och", u"ost", u"ovn", u"oun", u"out", u"ouš",
             u"ušk", u"kyn", u"čan", u"kář", u"néř", u"ník",
             u"ctv", u"stv"), 3, False),
        (4, (u"áč", u"ač", u"án", u"an", u"ář", u"as"), 2, False),
        (4, (u"ec", u"en", u"ěn", u"éř", u"íř", u"ic", u"in", u"ín",
             u"it", u"iv"), 1, True),
        (4, (u"ob", u"ot", u"ov", u"oň", u"ul", u"yn", u"čk", u"čn",
             u"dl", u"nk", u"tv", u"tk", u"vk"), 2, False),
        (3, (u"c", u"č", u"k", u"l", u"n", u"t"), 1, False),
    )
    size = len(word)
    for longer_than, suffixes, cut, soften in rules:
        if size <= longer_than:
            continue
        # All suffixes of one rule share the same length.
        if word[-len(suffixes[0]):] in suffixes:
            stem = word[:size - cut]
            return _palatalise(stem) if soften else stem
    return word

def _palatalise(word):
    """Replace a palatalised word ending with its hard counterpart.

    The endings are checked in the order listed below; the first match is
    replaced by the paired consonant(s).  When none matches, the last
    character of *word* is simply dropped.
    """
    replacements = (
        ((u"ci", u"ce", u"či", u"če"), u"k"),
        ((u"zi", u"ze", u"ži", u"že"), u"h"),
        ((u"čtě", u"čti", u"čtí"), u"ck"),
        ((u"ště", u"šti", u"ští"), u"sk"),
    )
    for endings, hard in replacements:
        width = len(endings[0])  # every ending in a group has this length
        if word[-width:] in endings:
            return word[:-width] + hard
    return word[:-1]

if __name__ == '__main__':
    # Command-line filter: stem every whitespace-separated word read from
    # standard input and write the stems to standard output, one output
    # line per input line.  The single argument selects the stemming mode.
    import io

    if len(sys.argv) != 2 or sys.argv[1] not in (u"light", u"aggressive"):
        sys.exit(u"usage: {} light|aggressive".format(sys.argv[0]))
    aggressive = sys.argv[1] == u"aggressive"
    # Read and write through explicit text streams.  Under Python 2 the
    # plain sys.stdin yields byte strings, which cannot be compared with
    # the unicode suffix tables, and the ``print`` statement used before is
    # a syntax error under Python 3.
    # NOTE(review): UTF-8 is assumed for both streams (it matches this
    # file's own coding declaration) -- confirm against actual usage.
    with io.open(sys.stdin.fileno(), encoding="utf-8",
                 closefd=False) as source, \
            io.open(sys.stdout.fileno(), "w", encoding="utf-8",
                    closefd=False) as sink:
        for line in source:
            stems = [cz_stem(word, aggressive=aggressive)
                     for word in line.split()]
            sink.write(u" ".join(stems) + u"\n")