* Move morphological analysis into its own module, morphology.pyx

commit 6b34a2f34b (parent b962fe73d7)
Author: Matthew Honnibal
Date:   2014-12-09 21:16:17 +11:00

7 changed files with 135 additions and 97 deletions

spacy/en.pyx

@@ -35,8 +35,8 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
-from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from .tagger cimport X, PUNCT, EOL
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL
 from .tokens cimport Morphology
@@ -154,8 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)

     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -165,8 +165,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c
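
The net effect on English: after predicting a tag id, the class defers lemmatization
and morphology to the Morphologizer instead of doing the work inline. A minimal
pure-Python sketch of the new control flow (the plain-Python token objects and the
fill_pos_context parameter are hypothetical stand-ins for the TokenC array and the
Cython feature extractor above; this is not the real implementation):

    # Sketch only: predict the fine-grained tag, then let the Morphologizer
    # assign lemma + morphological features for that tag.
    def set_pos(tokens, pos_tagger, morphologizer, fill_pos_context):
        for i in range(len(tokens)):
            context = fill_pos_context(tokens, i)   # feature extraction
            tokens[i].pos = pos_tagger.predict(context)
            if morphologizer is not None:
                morphologizer.set_morph(i, tokens)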

spacy/lang.pxd

@@ -2,15 +2,15 @@ from libcpp.vector cimport vector
 from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
-from preshed.maps cimport PreshMap, PreshMapArray
+from preshed.maps cimport PreshMap
 from cymem.cymem cimport Pool

 from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
+from .morphology cimport Morphologizer


 cdef union LexemesOrTokens:
@@ -40,17 +40,14 @@ cdef class Language:
     cdef readonly unicode name
     cdef PreshMap _cache
     cdef PreshMap _specials
-    cdef PreshMapArray _lemmas
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
-    cpdef readonly object lemmatizer
+    cpdef readonly Morphologizer morphologizer

     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re

-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
-
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

spacy/lang.pyx

@@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from .lemmatizer import Lemmatizer

 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
-from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
-
 from .tokens cimport Morphology
@@ -43,39 +40,16 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
-        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
-        self.lemmatizer = None
+        self.morphologizer = None

     def load(self):
-        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
-
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.sic
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
-        if lemma != 0:
-            return lemma
-        cdef bytes py_string = self.lexicon.strings[lex.sic]
-        cdef set lemma_strings
-        cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos, lex.sic, <void*>lemma)
-        return lemma
+            self.morphologizer = Morphologizer(self.lexicon.strings,
+                                               path.join(util.DATA_DIR, self.name))

     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
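
Read together, the path.join calls in load() above imply an on-disk layout along
these lines (inferred from the diff; 'en' stands in for whatever Language.name is):

    DATA_DIR/
        wordnet/            # WordNet files consumed by the Lemmatizer
        en/                 # one directory per language (Language.name)
            lexemes         # lexicon, loaded by Lexicon.load()
            strings         # string store
            pos/            # presence of this directory enables tagging
                config.json # tag_names, tag_map, templates, tag_counts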

spacy/morphology.pxd (new file, 42 lines)

@@ -0,0 +1,42 @@
+from .tokens cimport TokenC, Morphology
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
+
+from preshed.maps cimport PreshMapArray
+from cymem.cymem cimport Pool
+
+
+# Google universal tag set
+cpdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
+
+
+cdef class Morphologizer:
+    cdef Pool mem
+    cdef StringStore strings
+    cdef object lemmatizer
+    cdef PosTag* tags
+
+    cdef PreshMapArray _morph
+    cdef PreshMapArray _lemmas
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1
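
Morphologizer.__init__ (below) unpacks each tag_map entry from the tagger's
config.json as `pos, props = tag_map[tag]`, reading the PosTag fields declared
here. A hypothetical entry, assuming the universal POS is stored as the integer
value of univ_tag_t (NOUN == 6, VERB == 10 in the enum order above) and that any
feature omitted from props defaults to 0; the tag names and feature codes are
illustrative, not taken from spaCy's shipped data:

    "tag_map": {
        "NNS": [6,  {"number": 2}],
        "VBD": [10, {"tenspect": 1}]
    }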

spacy/morphology.pyx (new file, 81 lines)

@@ -0,0 +1,81 @@
+from os import path
+import json
+
+from .lemmatizer import Lemmatizer
+
+
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
+cdef class Morphologizer:
+    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
+    """
+    def __init__(self, StringStore strings, data_dir):
+        self.mem = Pool()
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        tag_map = cfg['tag_map']
+        tag_names = cfg['tag_names']
+        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
+        self._morph = PreshMapArray(len(tag_names))
+        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
+        for i, tag in enumerate(tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
+        return lemma
+
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
+        cdef const PosTag* tag = &self.tags[tokens[i].pos]
+        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
+        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
+        if morph is NULL:
+            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
+            tokens[i].morph = tag.morph
+        else:
+            tokens[i].morph = morph[0]
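
For readers newer to Cython, the lemma cache above can be restated in plain
Python: lemmas are memoized per (universal POS, orth id), and only NOUN, VERB
and ADJ are sent to the WordNet lemmatizer; every other tag keeps its surface
form. A dict-based paraphrase (attribute names mirror the Cython; PreshMapArray
becomes an ordinary dict, so this is a sketch, not the implementation):

    def lemmatize(self, pos, orth):
        if pos not in (NOUN, VERB, ADJ):
            return orth                         # no lemma for non-content tags
        key = (pos, orth)
        if key in self._lemmas:                 # cache hit
            return self._lemmas[key]
        string = self.strings[orth]
        analyse = {NOUN: self.lemmatizer.noun,
                   VERB: self.lemmatizer.verb,
                   ADJ: self.lemmatizer.adj}[pos]
        lemma_string = sorted(analyse(string))[0]   # alphabetically-first candidate
        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
        self._lemmas[key] = lemma
        return lemma

The sorted(...)[0] step makes the lemma deterministic when WordNet returns
several candidates, and caching the interned id means each (POS, word) pair
hits the lemmatizer at most once.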

spacy/tagger.pxd

@@ -12,31 +12,6 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, Morphology

-
-# Google universal tag set
-cdef enum univ_tag_t:
-    NO_TAG
-    ADJ
-    ADV
-    ADP
-    CONJ
-    DET
-    NOUN
-    NUM
-    PRON
-    PRT
-    VERB
-    X
-    PUNCT
-    EOL
-    N_UNIV_TAGS
-
-
-cdef struct PosTag:
-    Morphology morph
-    int id
-    univ_tag_t pos
-

 cdef class Tagger:
     cdef class_t predict(self, const atom_t* context, object golds=*) except *
@@ -45,5 +20,4 @@ cdef class Tagger:
     cpdef readonly LinearModel model
     cpdef readonly list tag_names
-    cdef PosTag* tags
     cdef dict tagdict

spacy/tagger.pyx

@@ -34,23 +34,10 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
-        tag_map = cfg['tag_map']
         univ_counts = {}
         cdef unicode tag
         cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
-        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
-        for i, tag in enumerate(self.tag_names):
-            pos, props = tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            self.tags[i].morph.number = props.get('number', 0)
-            self.tags[i].morph.tenspect = props.get('tenspect', 0)
-            self.tags[i].morph.mood = props.get('mood', 0)
-            self.tags[i].morph.gender = props.get('gender', 0)
-            self.tags[i].morph.person = props.get('person', 0)
-            self.tags[i].morph.case = props.get('case', 0)
-            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
@@ -85,23 +72,6 @@ cdef class Tagger:
         return tag_id

-
-UNIV_TAGS = {
-    'NULL': NO_TAG,
-    'ADJ': ADJ,
-    'ADV': ADV,
-    'ADP': ADP,
-    'CONJ': CONJ,
-    'DET': DET,
-    'NOUN': NOUN,
-    'NUM': NUM,
-    'PRON': PRON,
-    'PRT': PRT,
-    'VERB': VERB,
-    'X': X,
-    '.': PUNCT,
-    'EOL': EOL
-}
-

 def _make_tag_dict(counts):
     freq_thresh = 50