* Improve efficiency of tagger, and improve morphological processing

Matthew Honnibal 2014-12-10 01:02:04 +11:00
parent 6b34a2f34b
commit 42973c4b37
7 changed files with 83 additions and 61 deletions

View File

@@ -125,23 +125,5 @@ cpdef enum:
     N_CONTEXT_FIELDS
 
-cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
-
-cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.sic
-    context[1] = t.lex.cluster
-    context[2] = t.lex.shape
-    context[3] = t.lex.prefix
-    context[4] = t.lex.suffix
-    context[5] = t.pos
-    context[6] = t.sense
-
 cdef class English(Language):
     pass

View File

@@ -151,10 +151,14 @@ cdef class English(Language):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         cdef TokenC* t = tokens.data
+        assert self.morphologizer is not None
+        cdef dict tagdict = self.pos_tagger.tagdict
         for i in range(tokens.length):
-            fill_pos_context(context, i, t)
-            t[i].pos = self.pos_tagger.predict(context)
-            if self.morphologizer:
-                self.morphologizer.set_morph(i, t)
+            if t[i].lex.sic in tagdict:
+                t[i].pos = tagdict[t[i].lex.sic]
+            else:
+                fill_pos_context(context, i, t)
+                t[i].pos = self.pos_tagger.predict(context)
+            self.morphologizer.set_morph(i, t)
 
     def train_pos(self, Tokens tokens, golds):
@@ -165,27 +169,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            if self.morphologizer:
-                self.morphologizer.set_morph(i, t)
+            self.morphologizer.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c
 
-cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
-    if tok_morph.number == 0:
-        tok_morph.number = pos_morph.number
-    if tok_morph.tenspect == 0:
-        tok_morph.tenspect = pos_morph.tenspect
-    if tok_morph.mood == 0:
-        tok_morph.mood = pos_morph.mood
-    if tok_morph.gender == 0:
-        tok_morph.gender = pos_morph.gender
-    if tok_morph.person == 0:
-        tok_morph.person = pos_morph.person
-    if tok_morph.case == 0:
-        tok_morph.case = pos_morph.case
-    if tok_morph.misc == 0:
-        tok_morph.misc = pos_morph.misc
+cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
+    _fill_from_token(&context[P2_sic], &tokens[i-2])
+    _fill_from_token(&context[P1_sic], &tokens[i-1])
+    _fill_from_token(&context[W_sic], &tokens[i])
+    _fill_from_token(&context[N1_sic], &tokens[i+1])
+    _fill_from_token(&context[N2_sic], &tokens[i+2])
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.sic
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.pos
+    context[6] = t.sense
 
 EN = English('en')
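
Note: the tagging loop above takes a fast path for words the tagger already knows to be (nearly) unambiguous: if the word's `sic` id is in `pos_tagger.tagdict`, the stored tag is used directly and no features are extracted. A rough Python sketch of that idea follows; `extract_features` and `predict` are simplified stand-ins for the Cython `fill_pos_context` / `pos_tagger.predict` calls in the diff, not real spaCy APIs.

```python
def tag_tokens(tokens, tagdict, extract_features, predict):
    """Assign POS tags, skipping the statistical model for unambiguous words.

    tagdict maps a word id to its (near-)unambiguous tag; extract_features
    and predict stand in for the Cython tagger internals shown above.
    """
    tags = []
    for i, word_id in enumerate(tokens):
        if word_id in tagdict:
            # Fast path: frequent, unambiguous word -- no features, no model call.
            tags.append(tagdict[word_id])
        else:
            # Slow path: build the context window and ask the model.
            context = extract_features(tokens, i)
            tags.append(predict(context))
    return tags
```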

View File

@@ -35,8 +35,8 @@ cdef class Morphologizer:
     cdef StringStore strings
     cdef object lemmatizer
     cdef PosTag* tags
-    cdef PreshMapArray _morph
-    cdef PreshMapArray _lemmas
+    cdef readonly list tag_names
+    cdef PreshMapArray _cache
 
     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
     cdef int set_morph(self, const int i, TokenC* tokens) except -1

View File

@@ -1,8 +1,10 @@
+# cython: profile=True
+# cython: embedsignature=True
 from os import path
 import json
 
 from .lemmatizer import Lemmatizer
+from .typedefs cimport id_t
 
 UNIV_TAGS = {
     'NULL': NO_TAG,
@@ -22,6 +24,11 @@ UNIV_TAGS = {
 }
 
+
+cdef struct _Cached:
+    Morphology morph
+    int lemma
+
 
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
@@ -30,12 +37,11 @@ cdef class Morphologizer:
         self.strings = strings
         cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
         tag_map = cfg['tag_map']
-        tag_names = cfg['tag_names']
+        self.tag_names = cfg['tag_names']
         self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
-        self._lemmas = PreshMapArray(N_UNIV_TAGS)
-        self._morph = PreshMapArray(len(tag_names))
-        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
-        for i, tag in enumerate(tag_names):
+        self._cache = PreshMapArray(len(self.tag_names))
+        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
+        for i, tag in enumerate(self.tag_names):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -46,15 +52,15 @@ cdef class Morphologizer:
             self.tags[i].morph.person = props.get('person', 0)
             self.tags[i].morph.case = props.get('case', 0)
             self.tags[i].morph.misc = props.get('misc', 0)
+        if path.exists(path.join(data_dir, 'morph.json')):
+            with open(path.join(data_dir, 'morph.json')) as file_:
+                self.load_exceptions(json.loads(file_))
 
     cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
         if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
-        if lemma != 0:
-            return lemma
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
@@ -67,15 +73,45 @@ cdef class Morphologizer:
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos, lex.sic, <void*>lemma)
         return lemma
 
     cdef int set_morph(self, const int i, TokenC* tokens) except -1:
         cdef const PosTag* tag = &self.tags[tokens[i].pos]
-        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
-        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
-        if morph is NULL:
-            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
-            tokens[i].morph = tag.morph
-        else:
-            tokens[i].morph = morph[0]
+        cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
+        if cached is NULL:
+            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
+            cached.morph = tag.morph
+            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
+        tokens[i].lemma = cached.lemma
+        tokens[i].morph = cached.morph
+
+    def load_exceptions(self, dict exc):
+        cdef unicode pos_str
+        cdef unicode form_str
+        cdef unicode lemma_str
+        cdef dict entries
+        cdef dict props
+        cdef int lemma
+        cdef id_t sic
+        cdef univ_tag_t pos
+        for pos_str, entries in exc.items():
+            pos = self.tag_names.index(pos_str)
+            for form_str, props in entries.items():
+                lemma_str = props.get('L', form_str)
+                sic = self.strings[form_str]
+                cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
+                cached.lemma = self.strings[lemma_str]
+                set_morph_from_dict(&cached.morph, props)
+                self._cache.set(pos, sic, <void*>cached)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)
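
Note: `load_exceptions` expects a nested mapping from tag name to surface form to properties, where the optional 'L' key overrides the lemma and the remaining keys are the morphological features consumed by `set_morph_from_dict`. The sketch below shows that shape as a Python dict; the tag names, forms, and feature codes are illustrative guesses, not taken from the real morph.json.

```python
# Hypothetical exceptions data, shaped the way load_exceptions reads it:
# {tag_name: {surface_form: {"L": lemma, <morph feature>: <integer code>, ...}}}
exceptions = {
    "VBZ": {
        "is": {"L": "be", "number": 1, "person": 3},   # illustrative feature codes
    },
    "PRP": {
        "them": {"L": "they", "case": 2},              # illustrative feature codes
    },
}

# Entries end up in the same (tag id, word id) cache that set_morph consults,
# so exception forms bypass the lemmatizer and the tag's default morphology.
# morphologizer.load_exceptions(exceptions)
```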

View File

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 import unicodedata
 
 from unidecode import unidecode
+import re
 import math

View File

@@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray
 
-from .typedefs cimport hash_t
+from .typedefs cimport hash_t, id_t
 from .tokens cimport Tokens, Morphology

View File

@@ -72,10 +72,9 @@ cdef class Tagger:
         return tag_id
 
 def _make_tag_dict(counts):
-    freq_thresh = 50
-    ambiguity_thresh = 0.98
+    freq_thresh = 20
+    ambiguity_thresh = 0.97
     tagdict = {}
     cdef atom_t word
     cdef atom_t tag
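
Note: the body of `_make_tag_dict` is cut off above, but the two thresholds describe the intent: a word enters the tag dictionary only if it was seen at least `freq_thresh` times in training and a single tag accounts for at least `ambiguity_thresh` of those occurrences. A minimal Python sketch of that construction, assuming `counts` maps each word id to a dict of per-tag frequencies (an assumption about its shape, not taken from the diff):

```python
def make_tag_dict(counts, freq_thresh=20, ambiguity_thresh=0.97):
    """Keep only frequent words whose tag distribution is nearly unambiguous."""
    tagdict = {}
    for word, tag_freqs in counts.items():
        total = sum(tag_freqs.values())
        if total < freq_thresh:
            continue                      # too rare to trust
        best_tag, best_count = max(tag_freqs.items(), key=lambda kv: kv[1])
        if best_count / total >= ambiguity_thresh:
            tagdict[word] = best_tag      # safe to tag without the model
    return tagdict
```

Lowering the thresholds from 50/0.98 to 20/0.97 admits more words into the dictionary, so more tokens take the fast path in the tagging loop above, trading a small amount of tag ambiguity for speed.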