Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 09:44:36 +03:00)

commit 99bbbb6feb (parent 7b68f911cf)

    Work on morphological processing

spacy/en.pxd | 51
@@ -5,6 +5,57 @@ from .tokens cimport Tokens
 from .tokens cimport TokenC
 
+
+cpdef enum en_person_t:
+    NO_PERSON
+    FIRST
+    SECOND
+    THIRD
+
+
+cpdef enum en_number_t:
+    NO_NUMBER
+    SINGULAR
+    PLURAL
+    MASS
+    CARDINAL
+    ORDINAL
+
+
+cpdef enum en_gender_t:
+    NO_GENDER
+    MASCULINE
+    FEMININE
+
+
+cpdef enum en_tenspect_t:
+    NO_TENSE
+    BASE_VERB
+    PRESENT
+    PAST
+    PASSIVE
+    ING
+    MODAL
+
+
+cpdef enum en_case_t:
+    NO_CASE
+    NOMINATIVE
+    ACCUSATIVE
+    GENITIVE
+    DEMONYM
+
+
+cpdef enum misc_t:
+    NO_MISC
+    COMPARATIVE
+    SUPERLATIVE
+    RELATIVE
+    NAME
+    URL
+    EMAIL
+    EMOTICON
+
 
 # Flags
 cpdef enum FlagID:
     IS_ALPHA
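
Note: the enum blocks above give each morphological feature a small integer code, and every enum reserves its 0 value (NO_PERSON, NO_NUMBER, ...) for "unset", so zero-initialised token data carries no morphology. A rough pure-Python mirror of that convention (IntEnum and these class names are illustrative only; the real declarations are the C-level enums above):

from enum import IntEnum

# Mirrors the cpdef enums in spacy/en.pxd: each feature's 0 value means
# "unset", so a zero-filled Morphology record asserts nothing.
class EnPerson(IntEnum):
    NO_PERSON = 0
    FIRST = 1
    SECOND = 2
    THIRD = 3

class EnNumber(IntEnum):
    NO_NUMBER = 0
    SINGULAR = 1
    PLURAL = 2
    MASS = 3
    CARDINAL = 4
    ORDINAL = 5

# Features for a third-person singular form; anything unnamed stays 0.
features = {'person': EnPerson.THIRD, 'number': EnNumber.SINGULAR}
assert all(v != 0 for v in features.values())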

spacy/en.pyx | 73
@@ -35,6 +35,63 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
+from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .tagger cimport X, PUNCT, EOL
+
+
+POS_TAGS = {
+    'NULL': (NO_TAG, {}),
+    'EOL': (EOL, {}),
+    'CC': (CONJ, {}),
+    'CD': (NUM, {}),
+    'DT': (DET, {}),
+    'EX': (DET, {}),
+    'FW': (X, {}),
+    'IN': (ADP, {}),
+    'JJ': (ADJ, {}),
+    'JJR': (ADJ, {'misc': COMPARATIVE}),
+    'JJS': (ADJ, {'misc': SUPERLATIVE}),
+    'LS': (X, {}),
+    'MD': (VERB, {'tenspect': MODAL}),
+    'NN': (NOUN, {}),
+    'NNS': (NOUN, {'number': PLURAL}),
+    'NNP': (NOUN, {'misc': NAME}),
+    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
+    'PDT': (DET, {}),
+    'POS': (PRT, {'case': GENITIVE}),
+    'PRP': (NOUN, {}),
+    'PRP$': (NOUN, {'case': GENITIVE}),
+    'RB': (ADV, {}),
+    'RBR': (ADV, {'misc': COMPARATIVE}),
+    'RBS': (ADV, {'misc': SUPERLATIVE}),
+    'RP': (PRT, {}),
+    'SYM': (X, {}),
+    'TO': (PRT, {}),
+    'UH': (X, {}),
+    'VB': (VERB, {}),
+    'VBD': (VERB, {'tenspect': PAST}),
+    'VBG': (VERB, {'tenspect': ING}),
+    'VBN': (VERB, {'tenspect': PASSIVE}),
+    'VBP': (VERB, {'tenspect': PRESENT}),
+    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
+    'WDT': (DET, {'misc': RELATIVE}),
+    'WP': (PRON, {'misc': RELATIVE}),
+    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
+    'WRB': (ADV, {'misc': RELATIVE}),
+    '!': (PUNCT, {}),
+    '#': (PUNCT, {}),
+    '$': (PUNCT, {}),
+    "''": (PUNCT, {}),
+    "(": (PUNCT, {}),
+    ")": (PUNCT, {}),
+    "-LRB-": (PUNCT, {}),
+    "-RRB-": (PUNCT, {}),
+    ".": (PUNCT, {}),
+    ",": (PUNCT, {}),
+    "``": (PUNCT, {}),
+    ":": (PUNCT, {}),
+    "?": (PUNCT, {}),
+}
 
 
 POS_TEMPLATES = (
@@ -91,19 +148,25 @@ cdef class English(Language):
     def set_pos(self, Tokens tokens):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
+        cdef TokenC* t = tokens.data
         for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context)
+            fill_pos_context(context, i, t)
+            t[i].pos = self.pos_tagger.predict(context)
+            #self.morphalyser.set_token(&t[i])
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         c = 0
+        cdef TokenC* t = tokens.data
        for i in range(tokens.length):
-            fill_pos_context(context, i, tokens.data)
-            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            c += tokens.data[i].pos == golds[i]
+            fill_pos_context(context, i, t)
+            t[i].pos = self.pos_tagger.predict(context, [golds[i]])
+            t[i].morph = self.pos_tagger.tags[t[i].pos].morph
+            #self.analyse_morph(&t[i].lemma, &t[i].morph, t[i].pos, t[i].lex)
+            c += t[i].pos == golds[i]
         return c
 
 
+
 EN = English('en')
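
Note: POS_TAGS decomposes each Penn Treebank tag into a coarse universal tag plus a dict of morphological features, and set_pos/train_pos then copy the pre-computed Morphology for the predicted tag onto the token. A minimal pure-Python sketch of that lookup (the integer constants are toy stand-ins for the cimported enum values):

# Toy stand-ins for the cimported enum constants (C-level in the real code).
NOUN, VERB, PLURAL, PAST, PRESENT, THIRD = range(1, 7)

# A small subset of the POS_TAGS table above.
POS_TAGS = {
    'NN':  (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
}

def analyse(ptb_tag):
    """Split a fine-grained PTB tag into (coarse POS, morphological features)."""
    return POS_TAGS[ptb_tag]

print(analyse('NNS'))   # (1, {'number': 3})
print(analyse('VBZ'))   # (2, {'tenspect': 5, 'person': 6})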

spacy/lang.pxd
@@ -2,20 +2,20 @@ from libcpp.vector cimport vector
 
 from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
 
-from preshed.maps cimport PreshMap
+from preshed.maps cimport PreshMap, PreshMapArray
 from cymem.cymem cimport Pool
 
 from .typedefs cimport hash_t
 from .tokens cimport Tokens, TokenC
 from .lexeme cimport Lexeme
 from .tagger cimport Tagger
+from .tagger cimport PosTag
 from .utf8string cimport StringStore, UniStr
 
 
 cdef class Lexicon:
     cpdef public get_lex_props
     cdef Pool mem
-    cpdef readonly size_t size
     cpdef readonly StringStore strings
     cdef vector[Lexeme*] lexemes
@@ -29,13 +29,17 @@ cdef class Language:
     cdef readonly unicode name
     cdef PreshMap _cache
     cdef PreshMap _specials
+    cdef PreshMapArray _lemmas
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
+    cpdef readonly object lemmatizer
 
     cdef object _prefix_re
     cdef object _suffix_re
     cdef object _infix_re
 
+    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1
+
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)

spacy/lang.pyx
@@ -14,6 +14,7 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
+from .lemmatizer import Lemmatizer
 
 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,6 +27,8 @@ from . import util
 from .util import read_lang_data
 from .tokens import Tokens
 
+from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
+
 
 cdef class Language:
     def __init__(self, name):
@@ -39,14 +42,40 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
+        self.lemmatizer = None
 
     def load(self):
+        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
 
+    cdef int lemmatize(self, const PosTag* pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos.pos != NOUN and pos.pos != VERB and pos.pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos.pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.lexicon.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos.pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos.pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos.pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos.pos, lex.sic, <void*>lemma)
+        return lemma
+
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
         cdef Tokens tokens = Tokens(self.lexicon.strings, length)
@@ -254,9 +283,11 @@ cdef class Lexicon:
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
-        self.size = 2
         self.get_lex_props = get_props
 
+    def __len__(self):
+        return self.lexemes.size()
+
     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
         '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
         if necessary, using memory acquired from the given pool. If the pool
@@ -269,14 +300,13 @@ cdef class Lexicon:
             mem = self.mem
         cdef unicode py_string = string.chars[:string.n]
         lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
+        lex[0] = lexeme_init(self.lexemes.size(), py_string, string.key, self.strings,
                              self.get_lex_props(py_string))
         if mem is self.mem:
             self._map.set(string.key, lex)
             while self.lexemes.size() < (lex.id + 1):
                 self.lexemes.push_back(&EMPTY_LEXEME)
             self.lexemes[lex.id] = lex
-            self.size += 1
         else:
             lex[0].id = 1
         return lex
@@ -302,6 +332,8 @@ cdef class Lexicon:
         a dict if the operator is called from Python.
         '''
         if type(id_or_string) == int:
+            if id_or_string >= self.lexemes.size():
+                raise IndexError
             return self.lexemes.at(id_or_string)[0]
         cdef UniStr string
         slice_unicode(&string, id_or_string, 0, len(id_or_string))
@@ -359,5 +391,4 @@ cdef class Lexicon:
             self.lexemes.push_back(&EMPTY_LEXEME)
         self.lexemes[lexeme.id] = lexeme
         i += 1
-        self.size += 1
         fclose(fp)
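
Note: Language.lemmatize above memoises lemmas per (universal POS, orth id) in a PreshMapArray, treats 0 as "not computed yet", and only lemmatizes nouns, verbs and adjectives, interning the alphabetically first candidate. A pure-Python sketch of the same control flow (plain dicts stand in for PreshMapArray and the StringStore; ToyLemmatizer is hypothetical):

NOUN, VERB, ADJ = 1, 2, 3

class LemmaCache(object):
    """Sketch of Language.lemmatize: memoise by (pos, word id)."""
    def __init__(self, lemmatizer, strings):
        self.lemmatizer = lemmatizer  # exposes .noun/.verb/.adj -> set of strings
        self.strings = strings        # string interner: str -> nonzero int id
        self._lemmas = {}             # stands in for PreshMapArray(N_UNIV_TAGS)

    def lemmatize(self, pos, sic, word):
        # Only open-class words are lemmatized; everything else keeps its form.
        if pos not in (NOUN, VERB, ADJ):
            return sic
        cached = self._lemmas.get((pos, sic), 0)  # 0 == "not computed yet"
        if cached != 0:
            return cached
        get_candidates = {NOUN: self.lemmatizer.noun,
                          VERB: self.lemmatizer.verb,
                          ADJ: self.lemmatizer.adj}[pos]
        lemma_string = sorted(get_candidates(word))[0]  # deterministic pick
        lemma = self.strings.setdefault(lemma_string, len(self.strings) + 1)
        self._lemmas[(pos, sic)] = lemma
        return lemma

class ToyLemmatizer(object):  # hypothetical stand-in for the WordNet lemmatizer
    def noun(self, s): return {s.rstrip('s')}
    def verb(self, s): return {s}
    def adj(self, s): return {s}

cache = LemmaCache(ToyLemmatizer(), {})
print(cache.lemmatize(NOUN, sic=7, word='ponies'))  # 1: freshly interned
print(cache.lemmatize(NOUN, sic=7, word='ponies'))  # 1: served from the cache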

spacy/lemmatizer.py
@@ -53,6 +53,7 @@ class Lemmatizer(object):
 
 
 def lemmatize(string, index, exceptions, rules):
+    string = string.lower()
     forms = []
     if string in index:
         forms.append(string)
@@ -62,6 +63,8 @@ def lemmatize(string, index, exceptions, rules):
         form = string[:len(string) - len(old)] + new
         if form in index:
             forms.append(form)
+    if not forms:
+        forms.append(string)
     return set(forms)
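
Note: the two added lines make lemmatize() case-insensitive and guarantee a non-empty result. Only part of the function body is visible in these hunks; the sketch below is a self-contained approximation that assumes the elided middle applies an exception table and suffix rules in the usual WordNet-morphy fashion:

def lemmatize(string, index, exceptions, rules):
    string = string.lower()                    # added in this commit
    forms = []
    if string in index:
        forms.append(string)
    forms.extend(exceptions.get(string, []))   # assumed shape of the elided middle
    for old, new in rules:
        if string.endswith(old):
            form = string[:len(string) - len(old)] + new
            if form in index:
                forms.append(form)
    if not forms:                              # added in this commit: never empty
        forms.append(string)
    return set(forms)

index = {'aardvark', 'ox'}
rules = [('s', ''), ('es', '')]
print(lemmatize('Aardvarks', index, {}, rules))           # {'aardvark'}
print(lemmatize('oxen', index, {'oxen': ['ox']}, rules))  # {'ox'}
print(lemmatize('xyzzy', index, {}, rules))               # {'xyzzy'}: the new fallback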

(unnamed file in this view: PTB-to-universal tag mapping)
@@ -147,6 +147,7 @@ Y PRT
 Z NOUN
 ^ NOUN
 ~ X
-`` .""".strip().split('\n'))
+`` .
+EOL EOL""".strip().split('\n'))
     return mapping[tag]

spacy/tagger.pxd
@@ -1,11 +1,40 @@
+from libc.stdint cimport uint8_t
+
 from cymem.cymem cimport Pool
 
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 
+from preshed.maps cimport PreshMapArray
+
 from .typedefs cimport hash_t
-from .tokens cimport Tokens
+from .tokens cimport Tokens, Morphology
 
 
+# Google universal tag set
+cdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
+
+
 cdef class Tagger:
@@ -16,4 +45,5 @@ cdef class Tagger:
     cpdef readonly LinearModel model
 
     cpdef readonly list tag_names
+    cdef PosTag* tags
     cdef dict tagdict

spacy/tagger.pyx
@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats
 
 
-def setup_model_dir(tag_names, tag_counts, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map,
         'tag_counts': tag_counts,
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
@@ -33,16 +34,31 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
+        tag_map = cfg['tag_map']
+        univ_counts = {}
+        cdef unicode tag
+        cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
+        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
+        for i, tag in enumerate(self.tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))
 
-    cdef class_t predict(self, const atom_t* context, object golds=None) except *:
-        """Predict the tag of tokens[i]. The tagger remembers the features and
-        prediction, in case you later call tell_answer.
+    cdef class_t predict(self, atom_t* context, object golds=None) except *:
+        """Predict the tag of tokens[i].
 
         >>> tokens = EN.tokenize(u'An example sentence.')
         >>> tag = EN.pos_tagger.predict(0, tokens)
@@ -69,6 +85,24 @@ cdef class Tagger:
         return tag_id
 
 
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
 def _make_tag_dict(counts):
     freq_thresh = 50
     ambiguity_thresh = 0.98
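
Note: the tagger now persists the tag_map in config.json and rebuilds a PosTag table from it at load time, so a predicted tag id immediately yields its universal POS and Morphology. A plain-Python sketch of that round trip (dicts stand in for the C PosTag structs; the config layout follows the code above):

import json, os, tempfile

MORPH_FIELDS = ('number', 'tenspect', 'mood', 'gender', 'person', 'case', 'misc')

def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
    # Mirrors the config written above: tag_map now travels with the model.
    config = {'templates': templates, 'tag_names': tag_names,
              'tag_map': tag_map, 'tag_counts': tag_counts}
    with open(os.path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)

def load_tag_table(model_dir):
    # Mirrors Tagger.__init__: one entry per tag; unnamed features default to 0.
    cfg = json.load(open(os.path.join(model_dir, 'config.json')))
    tags = []
    for i, tag in enumerate(cfg['tag_names']):
        pos, props = cfg['tag_map'][tag]
        tags.append({'id': i, 'pos': pos,
                     'morph': {f: props.get(f, 0) for f in MORPH_FIELDS}})
    return tags

model_dir = tempfile.mkdtemp()
setup_model_dir(['NNS'], {'NNS': (1, {'number': 2})}, {}, [], model_dir)
print(load_tag_table(model_dir))
# [{'id': 0, 'pos': 1, 'morph': {'number': 2, 'tenspect': 0, ...}}]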

spacy/tokens.pxd
@@ -5,14 +5,29 @@ from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t
 
 from .lexeme cimport Lexeme
 
 from .typedefs cimport flags_t
 from .utf8string cimport StringStore
+from libc.stdint cimport uint8_t, uint16_t
+
+
+cdef struct Morphology:
+    uint8_t number
+    uint8_t tenspect    # Tense/aspect/voice
+    uint8_t mood
+    uint8_t gender
+    uint8_t person
+    uint8_t case
+    uint8_t misc
 
 
 cdef struct TokenC:
     const Lexeme* lex
+    Morphology morph
     int idx
     int pos
+    int lemma
     int sense
@@ -37,7 +52,7 @@ cdef class Token:
     cdef public int i
     cdef public int idx
     cdef public int pos
-    cdef public int ner
+    cdef int lemma
 
     cdef public atom_t id
     cdef public atom_t cluster
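
Note: Morphology stores each feature in a single byte, so the whole analysis adds only seven bytes to TokenC and can be copied by value (as train_pos does with t[i].morph). A ctypes mirror of the layout, for illustration only:

import ctypes

class Morphology(ctypes.Structure):
    # One byte per feature; 0 always means "feature unset".
    _fields_ = [('number', ctypes.c_uint8),
                ('tenspect', ctypes.c_uint8),   # tense/aspect/voice
                ('mood', ctypes.c_uint8),
                ('gender', ctypes.c_uint8),
                ('person', ctypes.c_uint8),
                ('case', ctypes.c_uint8),
                ('misc', ctypes.c_uint8)]

m = Morphology(number=2, person=3)   # e.g. PLURAL, THIRD
assert ctypes.sizeof(Morphology) == 7
assert m.mood == 0                   # untouched fields default to "unset"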

spacy/tokens.pyx
@@ -51,7 +51,7 @@ cdef class Tokens:
     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
         return Token(self._string_store, i, self.data[i].idx, self.data[i].pos,
-                     self.data[i].sense, self.data[i].lex[0])
+                     self.data[i].lemma, self.data[i].lex[0])
 
     def __iter__(self):
         for i in range(self.length):
@@ -128,14 +128,15 @@ cdef class Tokens:
 
 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
+    def __init__(self, StringStore string_store, int i, int idx, int pos, int lemma,
                  dict lex):
         self._string_store = string_store
         self.idx = idx
         self.pos = pos
-        self.ner = ner
         self.i = i
         self.id = lex['id']
+
+        self.lemma = lemma
 
         self.cluster = lex['cluster']
         self.length = lex['length']
@@ -156,3 +157,10 @@ cdef class Token:
             return ''
         cdef bytes utf8string = self._string_store[self.sic]
         return utf8string.decode('utf8')
+
+    property lemma:
+        def __get__(self):
+            if self.lemma == 0:
+                return self.string
+            cdef bytes utf8string = self._string_store[self.lemma]
+            return utf8string.decode('utf8')
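
Note: the new lemma property decodes the interned lemma id back to text, falling back to the surface string while the id is still 0 (no lemma assigned yet). Assuming the EN object and the API shown in this commit's docstrings, a session would look roughly like this (a sketch against the 2014-era API, not something current spaCy exposes):

# Hypothetical session; the names (EN, load, tokenize, set_pos) all appear
# in the diffs above.
from spacy.en import EN

EN.load()
tokens = EN.tokenize(u'The oxen were grazing.')
EN.set_pos(tokens)
for token in tokens:
    # token.lemma falls back to token.string until a lemma id has been set.
    print(token.string, token.lemma)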