* Morphology and lemmatisation are now working. POS tagging is quite fast.

2025-07-15 18:52:29 +03:00 · 2014-12-10 08:09:32 +11:00 · 2014-12-10 08:09:32 +11:00 · 9959a64f7b
commit 9959a64f7b
parent 7831b06610
8 changed files with 48 additions and 38 deletions
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -53,7 +53,7 @@ cdef class Language:

    cpdef Tokens tokens_from_list(self, list strings):
        cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
        if length == 0:
            return tokens
        cdef UniStr string_struct
@ -81,7 +81,7 @@ cdef class Language:
            tokens (Tokens): A Tokens object, giving access to a sequence of Lexemes.
        """
        cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.lexicon.strings, length)
+        cdef Tokens tokens = Tokens(self, length)
        if length == 0:
            return tokens
        cdef int i = 0
@ -110,11 +110,10 @@ cdef class Language:
        return tokens

    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
-        cached = <Cached*>self._specials.get(key)
+        #cached = <Cached*>self._specials.get(key)
+        cached = <Cached*>self._cache.get(key)
        if cached == NULL:
-            cached = <Cached*>self._cache.get(key)
-            if cached == NULL:
-                return False
+            return False
        cdef int i
        if cached.is_lex:
            for i in range(cached.length):
@ -266,6 +265,7 @@ cdef class Language:
            cached.data.tokens = tokens
            slice_unicode(&string, chunk, 0, len(chunk))
            self._specials.set(string.key, cached)
+            self._cache.set(string.key, cached)


 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@ -80,6 +80,7 @@ cpdef enum attr_id_t:
    LENGTH
    CLUSTER
    POS_TYPE
+    LEMMA


 cdef struct Lexeme:
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -1,10 +1,13 @@
-from .tokens cimport TokenC, Morphology
+
+from .tokens cimport TokenC
 from .lexeme cimport Lexeme
 from .utf8string cimport StringStore
+from .typedefs cimport id_t, Morphology

 from preshed.maps cimport PreshMapArray
 from cymem.cymem cimport Pool

+
 # Google universal tag set
 cpdef enum univ_tag_t:
    NO_TAG
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -52,9 +52,9 @@ cdef class Morphologizer:
            self.tags[i].morph.person = props.get('person', 0)
            self.tags[i].morph.case = props.get('case', 0)
            self.tags[i].morph.misc = props.get('misc', 0)
-        if path.exists(path.join(data_dir, 'morph.json')):
-            with open(path.join(data_dir, 'morph.json')) as file_:
-                self.load_exceptions(json.loads(file_))
+        if path.exists(path.join(data_dir, 'morphs.json')):
+            with open(path.join(data_dir, 'morphs.json')) as file_:
+                self.load_exceptions(json.load(file_))

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        if self.lemmatizer is None:
--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@ -9,7 +9,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray

 from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens, Morphology
+from .tokens cimport Tokens


 cdef class Tagger:
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@ -7,19 +7,10 @@ from thinc.typedefs cimport atom_t
 from .lexeme cimport Lexeme

 from .typedefs cimport flags_t
-from .utf8string cimport StringStore
-from libc.stdint cimport uint8_t, uint16_t
+from .typedefs cimport Morphology
+from .lang cimport Language


-cdef struct Morphology:
-    uint8_t number
-    uint8_t tenspect # Tense/aspect/voice
-    uint8_t mood
-    uint8_t gender
-    uint8_t person
-    uint8_t case
-    uint8_t misc
-

 cdef struct TokenC:
    const Lexeme* lex
@ -40,7 +31,8 @@ ctypedef fused LexemeOrToken:

 cdef class Tokens:
    cdef Pool mem
-    cdef StringStore _string_store
+    cdef Language lang
+    cdef list tag_names

    cdef TokenC* data

@ -48,16 +40,15 @@ cdef class Tokens:
    cdef int max_length

    cdef int push_back(self, int i, LexemeOrToken lex_or_tok) except -1
-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1

    cpdef np.ndarray[long, ndim=2] get_array(self, list features)


 cdef class Token:
-    cdef StringStore _string_store
+    cdef public Language lang
    cdef public int i
    cdef public int idx
-    cdef public int pos
+    cdef int pos
    cdef int lemma

    cdef public atom_t id
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -30,8 +30,8 @@ cdef class Tokens:
    >>> from spacy.en import EN
    >>> tokens = EN.tokenize('An example sentence.')
    """
-    def __init__(self, StringStore string_store, string_length=0):
-        self._string_store = string_store
+    def __init__(self, Language lang, string_length=0):
+        self.lang = lang
        if string_length >= 3:
            size = int(string_length / 3.0)
        else:
@ -50,7 +50,7 @@ cdef class Tokens:

    def __getitem__(self, i):
        bounds_check(i, self.length, PADDING)
-        return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
+        return Token(self.lang, i, self.data[i].idx, self.data[i].pos,
                     self.data[i].lemma, self.data[i].lex[0])

    def __iter__(self):
@ -71,9 +71,6 @@ cdef class Tokens:
        self.length += 1
        return idx + t.lex.length

-    cpdef int set_tag(self, int i, int tag_type, int tag) except -1:
-        self.data[i].pos = tag
-
    @cython.boundscheck(False)
    cpdef np.ndarray[long, ndim=2] get_array(self, list attr_ids):
        cdef int i, j
@ -92,7 +89,10 @@ cdef class Tokens:

        cdef PreshCounter counts = PreshCounter(2 ** 8)
        for i in range(self.length):
-            attr = get_attr(self.data[i].lex, attr_id)
+            if attr_id == LEMMA:
+                attr = self.data[i].lemma
+            else:
+                attr = get_attr(self.data[i].lex, attr_id)
            counts.inc(attr, 1)
        return dict(counts)

@ -114,9 +114,9 @@ cdef class Tokens:

@cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
-                 dict lex):
-        self._string_store = string_store
+    def __init__(self, Language lang, int i, int idx,
+                 int pos, int lemma, dict lex):
+        self.lang = lang
        self.idx = idx
        self.pos = pos
        self.i = i
@ -141,12 +141,16 @@ cdef class Token:
        def __get__(self):
            if self.sic == 0:
                return ''
-            cdef bytes utf8string = self._string_store[self.sic]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.sic]
            return utf8string.decode('utf8')

    property lemma:
        def __get__(self):
            if self.lemma == 0:
                return self.string
-            cdef bytes utf8string = self._string_store[self.lemma]
+            cdef bytes utf8string = self.lang.lexicon.strings[self.lemma]
            return utf8string.decode('utf8')
+
+    property pos:
+        def __get__(self):
+            return self.lang.pos_tagger.tag_names[self.pos]
--- a/spacy/typedefs.pxd
+++ b/spacy/typedefs.pxd
@ -1,4 +1,5 @@
 from libc.stdint cimport uint16_t, uint32_t, uint64_t, uintptr_t
+from libc.stdint cimport uint8_t

 ctypedef uint64_t hash_t
 ctypedef char* utf8_t
@ -7,3 +8,13 @@ ctypedef uint64_t flags_t
 ctypedef uint32_t id_t
 ctypedef uint16_t len_t
 ctypedef uint16_t tag_t
+
+
+cdef struct Morphology:
+    uint8_t number
+    uint8_t tenspect # Tense/aspect/voice
+    uint8_t mood
+    uint8_t gender
+    uint8_t person
+    uint8_t case
+    uint8_t misc