* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas

This commit is contained in:
Matthew Honnibal 2014-12-09 14:48:01 +11:00
parent cda9ea9a4a
commit 302e09018b
8 changed files with 136 additions and 70 deletions
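The special cases are now read from a per-language specials.json file instead of the old (chunk, tokens) pairs, so each substring can carry its own lemma and morphological features. A minimal sketch of the structure ujson.load is expected to return, assuming only the "F" (surface form) and "L" (optional lemma) keys consumed by _load_special_tokenization below; the concrete entries are illustrative, chosen to match the updated tests:

rules = {
    "ain't": [{"F": "ai", "L": "be"}, {"F": "n't", "L": "not"}],
    "we'll": [{"F": "we"}, {"F": "'ll", "L": "will"}],
}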

View File

@@ -10,6 +10,7 @@ cpdef enum en_person_t:
FIRST
SECOND
THIRD
NON_THIRD
cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
SINGULAR
PLURAL
MASS
CARDINAL
ORDINAL
cpdef enum en_gender_t:
NO_GENDER
MASCULINE
FEMININE
NEUTER
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
GENITIVE
ACCUSATIVE
REFLEXIVE
DEMONYM
cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
MODAL
cpdef enum en_case_t:
NO_CASE
NOMINATIVE
ACCUSATIVE
GENITIVE
DEMONYM
cpdef enum misc_t:
NO_MISC
COMPARATIVE
SUPERLATIVE
RELATIVE
NAME
URL
EMAIL
EMOTICON
# Flags

View File

@@ -38,6 +38,8 @@ import orth
from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from .tagger cimport X, PUNCT, EOL
from .tokens cimport Morphology
POS_TAGS = {
'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context)
#self.morphalyser.set_token(&t[i])
_merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
def train_pos(self, Tokens tokens, golds):
cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context, [golds[i]])
t[i].morph = self.pos_tagger.tags[t[i].pos].morph
#self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
_merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
c += t[i].pos == golds[i]
return c
cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
if tok_morph.number == 0:
tok_morph.number = pos_morph.number
if tok_morph.tenspect == 0:
tok_morph.tenspect = pos_morph.tenspect
if tok_morph.mood == 0:
tok_morph.mood = pos_morph.mood
if tok_morph.gender == 0:
tok_morph.gender = pos_morph.gender
if tok_morph.person == 0:
tok_morph.person = pos_morph.person
if tok_morph.case == 0:
tok_morph.case = pos_morph.case
if tok_morph.misc == 0:
tok_morph.misc = pos_morph.misc
EN = English('en')
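For reference, a rough pure-Python analogue of _merge_morph above (an illustration, not the committed Cython): a field value of 0 is treated as unset, so morphology already set on the token, e.g. by a special-case rule, takes precedence over the POS tag's defaults.

def merge_morph(tok_morph, pos_morph):
    # Only fields still at 0 (unset) inherit the value from the POS tag.
    for field in ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc'):
        if getattr(tok_morph, field) == 0:
            setattr(tok_morph, field, getattr(pos_morph, field))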

View File

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
from .tokens cimport Tokens, TokenC
from .lexeme cimport Lexeme
from .tagger cimport Tagger
from .tagger cimport PosTag
from .tagger cimport univ_tag_t
from .utf8string cimport StringStore, UniStr
@@ -38,11 +38,12 @@ cdef class Language:
cdef object _suffix_re
cdef object _infix_re
cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text)
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
vector[Lexeme*] *suffixes) except NULL

View File

@@ -28,6 +28,7 @@ from .util import read_lang_data
from .tokens import Tokens
from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
from .tokens cimport Morphology
cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None:
return lex.sic
if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic
cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
if lemma != 0:
return lemma
cdef bytes py_string = self.lexicon.strings[lex.sic]
cdef set lemma_strings
cdef bytes lemma_string
if pos.pos == NOUN:
if pos == NOUN:
lemma_strings = self.lemmatizer.noun(py_string)
elif pos.pos == VERB:
elif pos == VERB:
lemma_strings = self.lemmatizer.verb(py_string)
else:
assert pos.pos == ADJ
assert pos == ADJ
lemma_strings = self.lemmatizer.adj(py_string)
lemma_string = sorted(lemma_strings)[0]
lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
self._lemmas.set(pos, lex.sic, <void*>lemma)
return lemma
cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
return tokens
cdef int i = 0
cdef int start = 0
cdef bint cache_hit
cdef Py_UNICODE* chars = string
cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
cdef UniStr span
@@ -118,10 +120,8 @@
if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
if start < i:
slice_unicode(&span, chars, start, i)
lexemes = <const Lexeme* const*>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
cache_hit = self._try_cache(start, span.key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
in_ws = not in_ws
start = i
@@ -130,13 +130,32 @@
i += 1
if start < i:
slice_unicode(&span, chars, start, i)
lexemes = <const Lexeme* const*>self._cache.get(span.key)
if lexemes != NULL:
tokens.extend(start, lexemes, 0)
else:
cache_hit = self._try_cache(start, span.key, tokens)
if not cache_hit:
self._tokenize(tokens, &span, start, i)
return tokens
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
cdef int i
specials = <TokenC*>self._specials.get(key)
if specials != NULL:
i = 0
while specials[i].lex != NULL:
tokens.push_back(idx, specials[i].lex)
tokens.data[tokens.length - 1].pos = specials[i].pos
tokens.data[tokens.length - 1].morph = specials[i].morph
tokens.data[tokens.length - 1].lemma = specials[i].lemma
tokens.data[tokens.length - 1].sense = specials[i].sense
i += 1
return True
else:
cached = <const Lexeme* const*>self._cache.get(key)
if cached != NULL:
tokens.extend(idx, cached, 0)
return True
else:
return False
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@ cdef class Language:
break
return string
cdef int _attach_tokens(self, Tokens tokens,
int idx, UniStr* string,
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
vector[const Lexeme*] *prefixes,
vector[const Lexeme*] *suffixes) except -1:
cdef bint cache_hit
cdef int split
cdef const Lexeme* const* lexemes
cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
if prefixes.size():
idx = tokens.extend(idx, prefixes.data(), prefixes.size())
if string.n != 0:
lexemes = <const Lexeme* const*>self._cache.get(string.key)
if lexemes != NULL:
idx = tokens.extend(idx, lexemes, 0)
cache_hit = self._try_cache(idx, string.key, tokens)
if cache_hit:
idx = tokens.data[tokens.length - 1].idx + 1
else:
split = self._find_infix(string.chars, string.n)
if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, token_rules):
'''Load special-case tokenization rules.
Loads special-case tokenization rules into the Language._cache,
read from data/<lang>/tokenization. The special cases are loaded before
any language data is tokenized, giving them priority. For instance,
the English tokenization rules map "ain't" to ["are", "not"].
Args:
token_rules (list): A list of (chunk, tokens) pairs, where chunk is
a string and tokens is a list of strings.
def _load_special_tokenization(self, object rules):
'''Add a special-case tokenization rule.
'''
cdef int i
cdef unicode chunk
cdef list substrings
cdef unicode form
cdef unicode lemma
cdef dict props
cdef Lexeme** lexemes
cdef hash_t hashed
cdef UniStr string
for uni_string, substrings in token_rules:
lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
for i, substring in enumerate(substrings):
slice_unicode(&string, substring, 0, len(substring))
lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
lexemes[i + 1] = NULL
slice_unicode(&string, uni_string, 0, len(uni_string))
self._specials.set(string.key, lexemes)
self._cache.set(string.key, lexemes)
for chunk, substrings in sorted(rules.items()):
tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
for i, props in enumerate(substrings):
form = props['F']
lemma = props.get("L", None)
slice_unicode(&string, form, 0, len(form))
tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
if lemma:
tokens[i].lemma = self.lexicon.strings[lemma]
set_morph_from_dict(&tokens[i].morph, props)
# Null-terminated array
tokens[i+1].lex = NULL
slice_unicode(&string, chunk, 0, len(chunk))
self._specials.set(string.key, tokens)
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
morph.number = props.get('number', 0)
morph.tenspect = props.get('tenspect', 0)
morph.mood = props.get('mood', 0)
morph.gender = props.get('gender', 0)
morph.person = props.get('person', 0)
morph.case = props.get('case', 0)
morph.misc = props.get('misc', 0)
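Besides "F" and "L", any of the field names read by set_morph_from_dict can appear in a substring's JSON object to pre-set morphology on a special-cased token; _merge_morph then leaves those fields untouched at tagging time. A hypothetical entry (the chunk and the person value are illustrative, not taken from the data files):

rules = {"y'all": [{"F": "y'", "L": "you", "person": 2}, {"F": "all"}]}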
cdef class Lexicon:

View File

@@ -21,7 +21,6 @@ cdef struct Morphology:
uint8_t misc
cdef struct TokenC:
const Lexeme* lex
Morphology morph

View File

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
def read_lang_data(name):
data_dir = path.join(DATA_DIR, name)
tokenization = read_tokenization(name)
with open(path.join(data_dir, 'specials.json')) as file_:
tokenization = ujson.load(file_)
prefix = read_prefix(data_dir)
suffix = read_suffix(data_dir)
infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return expression
def read_suffix(data_dir):
with utf8open(path.join(data_dir, 'suffix')) as file_:
with utf8open(path.join(data_dir, 'suffix')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
# TODO: Fix this hack!
expression += r'|(?<=[a-z0-9])\.$'
expression += r'|(?<=[0-9])km$'
return expression
def read_infix(data_dir):
with utf8open(path.join(data_dir, 'infix')) as file_:
entries = file_.read().split('\n')

View File

@@ -20,15 +20,18 @@ def test_apostrophe():
def test_LL():
tokens = EN.tokenize("we'll")
assert len(tokens) == 2
assert tokens[1].string == "will"
assert tokens[1].string == "'ll"
assert tokens[1].lemma == "will"
assert tokens[0].string == "we"
def test_aint():
tokens = EN.tokenize("ain't")
assert len(tokens) == 2
assert tokens[0].string == "are"
assert tokens[1].string == "not"
assert tokens[0].string == "ai"
assert tokens[0].lemma == "be"
assert tokens[1].string == "n't"
assert tokens[1].lemma == "not"
def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
assert len(tokens) == 2
tokens = EN.tokenize("Ain't")
assert len(tokens) == 2
assert tokens[0].string == "Are"
assert tokens[0].string == "Ai"
assert tokens[0].lemma == "be"
def test_punct():

View File

@@ -34,7 +34,7 @@ def test_digits():
def test_contraction():
tokens = EN.tokenize("don't giggle")
assert len(tokens) == 3
assert tokens[1].sic == EN.lexicon["not"]['sic']
assert tokens[1].sic == EN.lexicon["n't"]['sic']
tokens = EN.tokenize("i said don't!")
assert len(tokens) == 5
assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
tokens = EN.tokenize(text)
assert len(tokens) == 8
def test_cnts2():
text = u"""U.N. regulations are not a part of their concern."""
tokens = EN.tokenize(text)
assert len(tokens) == 10
def test_cnts3():
text = u"“Isn't it?”"
tokens = EN.tokenize(text)
assert len(tokens) == 6
words = [t.string for t in tokens]
assert len(words) == 6
def test_cnts4():
text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
tokens = EN.tokenize(text)
assert len(tokens) == 15
words = [t.string for t in tokens]
assert len(words) == 15
def test_cnts5():
text = """'Me too!', Mr. P. Delaware cried. """
tokens = EN.tokenize(text)
assert len(tokens) == 11
def test_cnts6():
text = u'They ran about 10km.'
tokens = EN.tokenize(text)
assert len(tokens) == 6
words = [t.string for t in tokens]
assert len(words) == 6
#def test_cnts7():
# text = 'But then the 6,000-year ice age came...'