Mirror of https://github.com/explosion/spaCy.git
	* Work on fixing special-cases, reading them in as JSON objects so that they can specify lemmas
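For context, the loader change in spacy/lang.pyx below reads each special case as a list of token property dicts: 'F' is the token's form, 'L' an optional lemma, and the Morphology field names (number, tenspect, mood, gender, person, case, misc) may also be given. A minimal sketch of what one entry of the new specials data might look like, shown as the Python dict ujson.load would produce; the forms and lemmas are taken from the updated tests, and the exact shipped file is not part of this commit:

    # Illustrative only: chunk -> list of token descriptions
    special_cases = {
        "ain't": [
            {"F": "ai", "L": "be"},
            {"F": "n't", "L": "not"},
        ],
        "we'll": [
            {"F": "we"},
            {"F": "'ll", "L": "will"},
        ],
    }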
This commit is contained in: commit 302e09018b (parent cda9ea9a4a)
spacy/en.pxd (24 lines changed)
@@ -10,6 +10,7 @@ cpdef enum en_person_t:
     FIRST
     SECOND
     THIRD
+    NON_THIRD
 
 
 cpdef enum en_number_t:
@@ -17,14 +18,22 @@ cpdef enum en_number_t:
     SINGULAR
     PLURAL
     MASS
-    CARDINAL
-    ORDINAL
 
 
 cpdef enum en_gender_t:
     NO_GENDER
     MASCULINE
     FEMININE
+    NEUTER
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    GENITIVE
+    ACCUSATIVE
+    REFLEXIVE
+    DEMONYM
 
 
 cpdef enum en_tenspect_t:
@@ -37,23 +46,12 @@ cpdef enum en_tenspect_t:
     MODAL
 
 
-cpdef enum en_case_t:
-    NO_CASE
-    NOMINATIVE
-    ACCUSATIVE
-    GENITIVE
-    DEMONYM
-
-
 cpdef enum misc_t:
     NO_MISC
     COMPARATIVE
     SUPERLATIVE
     RELATIVE
     NAME
-    URL
-    EMAIL
-    EMOTICON
 
 
 # Flags
spacy/en.pyx (25 lines changed)
@@ -38,6 +38,8 @@ import orth
 from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from .tagger cimport X, PUNCT, EOL
 
+from .tokens cimport Morphology
+
 
 POS_TAGS = {
     'NULL': (NO_TAG, {}),
@@ -152,7 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            #self.morphalyser.set_token(&t[i])
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -162,11 +165,27 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
-            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
+            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
             c += t[i].pos == golds[i]
         return c
 
 
+cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
+    if tok_morph.number == 0:
+        tok_morph.number = pos_morph.number
+    if tok_morph.tenspect == 0:
+        tok_morph.tenspect = pos_morph.tenspect
+    if tok_morph.mood == 0:
+        tok_morph.mood = pos_morph.mood
+    if tok_morph.gender == 0:
+        tok_morph.gender = pos_morph.gender
+    if tok_morph.person == 0:
+        tok_morph.person = pos_morph.person
+    if tok_morph.case == 0:
+        tok_morph.case = pos_morph.case
+    if tok_morph.misc == 0:
+        tok_morph.misc = pos_morph.misc
+
 
 EN = English('en')
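The _merge_morph helper added above gives token-level morphology (for example, values set by a special-case rule) precedence over the morphology attached to the predicted POS tag: a field is only copied from the tag when the token's value is still 0, i.e. unset. A rough pure-Python equivalent of that rule, with dicts standing in for the Morphology struct:

    MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

    def merge_morph(tok_morph, pos_morph):
        # 0 means "unset": keep any value the token already has,
        # fill the rest from the POS tag's morphology.
        for field in MORPH_FIELDS:
            if tok_morph[field] == 0:
                tok_morph[field] = pos_morph[field]
        return tok_morph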
spacy/lang.pxd

@@ -9,7 +9,7 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
-from .tagger cimport PosTag
+from .tagger cimport univ_tag_t
 from .utf8string cimport StringStore, UniStr
 
 
@@ -38,11 +38,12 @@ cdef class Language:
     cdef object _suffix_re
     cdef object _infix_re
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
 
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
                              vector[Lexeme*] *suffixes) except NULL
spacy/lang.pyx (112 lines changed)
@@ -28,6 +28,7 @@ from .util import read_lang_data
 from .tokens import Tokens
 
 from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+from .tokens cimport Morphology
 
 
 cdef class Language:
@@ -53,27 +54,27 @@ cdef class Language:
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
-    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
         if self.lemmatizer is None:
             return lex.sic
-        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+        if pos != NOUN and pos != VERB and pos != ADJ:
             return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
         if lemma != 0:
             return lemma
         cdef bytes py_string = self.lexicon.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos.pos == NOUN:
+        if pos == NOUN:
             lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos.pos == VERB:
+        elif pos == VERB:
             lemma_strings = self.lemmatizer.verb(py_string)
         else:
-            assert pos.pos == ADJ
+            assert pos == ADJ
             lemma_strings = self.lemmatizer.adj(py_string)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
         return lemma
 
     cpdef Tokens tokens_from_list(self, list strings):
@@ -111,6 +112,7 @@ cdef class Language:
             return tokens
         cdef int i = 0
         cdef int start = 0
+        cdef bint cache_hit
         cdef Py_UNICODE* chars = string
         cdef bint in_ws = Py_UNICODE_ISSPACE(chars[0])
         cdef UniStr span
@@ -118,10 +120,8 @@ cdef class Language:
             if Py_UNICODE_ISSPACE(chars[i]) != in_ws:
                 if start < i:
                     slice_unicode(&span, chars, start, i)
-                    lexemes = <const Lexeme* const*>self._cache.get(span.key)
-                    if lexemes != NULL:
-                        tokens.extend(start, lexemes, 0)
-                    else:
+                    cache_hit = self._try_cache(start, span.key, tokens)
+                    if not cache_hit:
                         self._tokenize(tokens, &span, start, i)
                 in_ws = not in_ws
                 start = i
@@ -130,13 +130,32 @@ cdef class Language:
         i += 1
         if start < i:
             slice_unicode(&span, chars, start, i)
-            lexemes = <const Lexeme* const*>self._cache.get(span.key)
-            if lexemes != NULL:
-                tokens.extend(start, lexemes, 0)
-            else:
+            cache_hit = self._try_cache(start, span.key, tokens)
+            if not cache_hit:
                 self._tokenize(tokens, &span, start, i)
         return tokens
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+        cdef int i
+        specials = <TokenC*>self._specials.get(key)
+        if specials != NULL:
+            i = 0
+            while specials[i].lex != NULL:
+                tokens.push_back(idx, specials[i].lex)
+                tokens.data[tokens.length - 1].pos = specials[i].pos
+                tokens.data[tokens.length - 1].morph = specials[i].morph
+                tokens.data[tokens.length - 1].lemma = specials[i].lemma
+                tokens.data[tokens.length - 1].sense = specials[i].sense
+                i += 1
+            return True
+        else:
+            cached = <const Lexeme* const*>self._cache.get(key)
+            if cached != NULL:
+                tokens.extend(i, cached, 0)
+                return True
+            else:
+                return False
+
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
         cdef vector[Lexeme*] prefixes
         cdef vector[Lexeme*] suffixes
@@ -190,10 +209,10 @@ cdef class Language:
                 break
         return string
 
-    cdef int _attach_tokens(self, Tokens tokens,
-                            int idx, UniStr* string,
+    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
                             vector[const Lexeme*] *prefixes,
                             vector[const Lexeme*] *suffixes) except -1:
+        cdef bint cache_hit
         cdef int split
         cdef const Lexeme* const* lexemes
         cdef Lexeme* lexeme
@@ -201,10 +220,9 @@ cdef class Language:
         if prefixes.size():
             idx = tokens.extend(idx, prefixes.data(), prefixes.size())
         if string.n != 0:
-
-            lexemes = <const Lexeme* const*>self._cache.get(string.key)
-            if lexemes != NULL:
-                idx = tokens.extend(idx, lexemes, 0)
+            cache_hit = self._try_cache(idx, string.key, tokens)
+            if cache_hit:
+                idx = tokens.data[tokens.length - 1].idx + 1
             else:
                 split = self._find_infix(string.chars, string.n)
                 if split == 0 or split == -1:
@@ -247,30 +265,42 @@ cdef class Language:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, token_rules):
-        '''Load special-case tokenization rules.
-
-        Loads special-case tokenization rules into the Language._cache cache,
-        read from data/<lang>/tokenization . The special cases are loaded before
-        any language data is tokenized, giving these priority.  For instance,
-        the English tokenization rules map "ain't" to ["are", "not"].
-
-        Args:
-            token_rules (list): A list of (chunk, tokens) pairs, where chunk is
-                a string and tokens is a list of strings.
+    def _load_special_tokenization(self, object rules):
+        '''Add a special-case tokenization rule.
         '''
+        cdef int i
+        cdef unicode chunk
+        cdef list substrings
+        cdef unicode form
+        cdef unicode lemma
+        cdef dict props
         cdef Lexeme** lexemes
         cdef hash_t hashed
        cdef UniStr string
-        for uni_string, substrings in token_rules:
-            lexemes = <Lexeme**>self.mem.alloc(len(substrings) + 1, sizeof(Lexeme*))
-            for i, substring in enumerate(substrings):
-                slice_unicode(&string, substring, 0, len(substring))
-                lexemes[i] = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
-            lexemes[i + 1] = NULL
-            slice_unicode(&string, uni_string, 0, len(uni_string))
-            self._specials.set(string.key, lexemes)
-            self._cache.set(string.key, lexemes)
+        for chunk, substrings in sorted(rules.items()):
+            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+            for i, props in enumerate(substrings):
+                form = props['F']
+                lemma = props.get("L", None)
+                slice_unicode(&string, form, 0, len(form))
+                tokens[i].lex = <Lexeme*>self.lexicon.get(self.lexicon.mem, &string)
+                if lemma:
+                    tokens[i].lemma = self.lexicon.strings[lemma]
+                set_morph_from_dict(&tokens[i].morph, props)
+            # Null-terminated array
+            tokens[i+1].lex = NULL
+            slice_unicode(&string, chunk, 0, len(chunk))
+            self._specials.set(string.key, tokens)
+
+
+cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
+    morph.number = props.get('number', 0)
+    morph.tenspect = props.get('tenspect', 0)
+    morph.mood = props.get('mood', 0)
+    morph.gender = props.get('gender', 0)
+    morph.person = props.get('person', 0)
+    morph.case = props.get('case', 0)
+    morph.misc = props.get('misc', 0)
 
 
 cdef class Lexicon:
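The new _try_cache method centralises the lookup that tokenize() and _attach_tokens() previously did inline: the _specials table is consulted first and its pre-analysed tokens (with pos, morph, lemma and sense already attached) are copied onto the output; otherwise the plain lexeme cache is used; only on a miss does the caller fall back to full affix splitting. A rough Python sketch of that control flow, with dicts standing in for the two C hash tables:

    def try_cache(key, idx, specials, cache, tokens):
        # Sketch only: dicts stand in for Language._specials and Language._cache.
        if key in specials:
            for tok in specials[key]:       # pre-analysed special-case tokens
                tokens.append(dict(tok, idx=idx))
            return True
        if key in cache:
            for lex in cache[key]:          # bare lexemes, no analysis attached
                tokens.append({'lex': lex, 'idx': idx})
            return True
        return False                        # caller runs the full tokenizer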
spacy/tokens.pxd

@@ -21,7 +21,6 @@ cdef struct Morphology:
     uint8_t misc
 
 
-
 cdef struct TokenC:
     const Lexeme* lex
     Morphology morph
spacy/util.py

@@ -13,7 +13,8 @@ def utf8open(loc, mode='r'):
 
 def read_lang_data(name):
     data_dir = path.join(DATA_DIR, name)
-    tokenization = read_tokenization(name)
+    with open(path.join(data_dir, 'specials.json')) as file_:
+        tokenization = ujson.load(file_)
     prefix = read_prefix(data_dir)
     suffix = read_suffix(data_dir)
     infix = read_infix(data_dir)
@@ -26,12 +27,17 @@ def read_prefix(data_dir):
         expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression
 
+
 def read_suffix(data_dir):
-    with  utf8open(path.join(data_dir, 'suffix')) as file_:
+    with utf8open(path.join(data_dir, 'suffix')) as file_:
         entries = file_.read().split('\n')
         expression = '|'.join([re.escape(piece) + '$' for piece in entries if piece.strip()])
+    # TODO: Fix this hack!
+    expression += r'|(?<=[a-z0-9])\.$'
+    expression += r'|(?<=[0-9])km$'
     return expression
 
+
 def read_infix(data_dir):
     with utf8open(path.join(data_dir, 'infix')) as file_:
         entries = file_.read().split('\n')
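The two patterns appended in read_suffix() above handle strings like "10km." (see test_cnts6 below): a full stop after a letter or digit, and a trailing "km" after a digit, are now always recognised as suffixes even if they are missing from the suffix data file. A quick illustration with Python's re module; the patterns are copied from the diff and the rest of the suffix list is omitted:

    import re

    suffix_re = re.compile(r'(?<=[a-z0-9])\.$' + '|' + r'(?<=[0-9])km$')

    print(suffix_re.search(u'10km.').group())   # '.'  -> '10km' + '.'
    print(suffix_re.search(u'10km').group())    # 'km' -> '10' + 'km'
    print(suffix_re.search(u'etc.').group())    # '.'  -> 'etc' + '.'
    # Applied repeatedly while splitting affixes, '10km.' ends up as '10', 'km', '.'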
@@ -20,15 +20,18 @@ def test_apostrophe():
 def test_LL():
     tokens = EN.tokenize("we'll")
     assert len(tokens) == 2
-    assert tokens[1].string == "will"
+    assert tokens[1].string == "'ll"
+    assert tokens[1].lemma == "will"
     assert tokens[0].string == "we"
 
 
 def test_aint():
     tokens = EN.tokenize("ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "are"
-    assert tokens[1].string == "not"
+    assert tokens[0].string == "ai"
+    assert tokens[0].lemma == "be"
+    assert tokens[1].string == "n't"
+    assert tokens[1].lemma == "not"
 
 
 def test_capitalized():
@@ -38,7 +41,8 @@ def test_capitalized():
     assert len(tokens) == 2
     tokens = EN.tokenize("Ain't")
     assert len(tokens) == 2
-    assert tokens[0].string == "Are"
+    assert tokens[0].string == "Ai"
+    assert tokens[0].lemma == "be"
 
 
 def test_punct():
@@ -34,7 +34,7 @@ def test_digits():
 def test_contraction():
     tokens = EN.tokenize("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.lexicon["not"]['sic']
+    assert tokens[1].sic == EN.lexicon["n't"]['sic']
     tokens = EN.tokenize("i said don't!")
     assert len(tokens) == 5
     assert tokens[4].sic == EN.lexicon['!']['sic']
@@ -71,30 +71,39 @@ def test_cnts1():
     tokens = EN.tokenize(text)
     assert len(tokens) == 8
 
+
 def test_cnts2():
     text = u"""U.N. regulations are not a part of their concern."""
     tokens = EN.tokenize(text)
     assert len(tokens) == 10
 
+
 def test_cnts3():
     text = u"“Isn't it?”"
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 def test_cnts4():
     text = u"""Yes! "I'd rather have a walk", Ms. Comble sighed. """
     tokens = EN.tokenize(text)
-    assert len(tokens) == 15
+    words = [t.string for t in tokens]
+    assert len(words) == 15
+
 
 def test_cnts5():
     text = """'Me too!', Mr. P. Delaware cried. """
     tokens = EN.tokenize(text)
     assert len(tokens) == 11
 
+
 def test_cnts6():
     text = u'They ran about 10km.'
     tokens = EN.tokenize(text)
-    assert len(tokens) == 6
+    words = [t.string for t in tokens]
+    assert len(words) == 6
+
 
 #def test_cnts7():
 #    text = 'But then the 6,000-year ice age came...'