Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-27 09:44:36 +03:00)
Commit ef4398b204 (parent 327383e38a)

    Rearrange POS stuff, so that language-specific stuff can live in
    language-specific modules

setup.py (1 line changed)
@@ -55,7 +55,6 @@ exts = [
     Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
     Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
-    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
     #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),
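The spacy.context extension can go because the module itself is emptied below; its helpers come back as cdef inline functions in spacy/en.pxd, which compile into whichever module cimports them and so need no Extension entry of their own. For orientation, a minimal sketch of how such an entry is driven, assuming the distutils/Cython.Distutils boilerplate of this era (not part of this commit):

    # Hypothetical, stripped-down setup.py in the style of the list above.
    from distutils.core import setup
    from distutils.extension import Extension
    from Cython.Distutils import build_ext

    includes = ['.']  # assumption: the real list carries the C header paths

    exts = [
        Extension("spacy.tagger", ["spacy/tagger.pyx"],
                  language="c++", include_dirs=includes),
    ]

    setup(name='spacy',
          cmdclass={'build_ext': build_ext},  # compile each .pyx to C++ first
          ext_modules=exts)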
spacy/context.pxd (64 lines removed)

@@ -1,64 +1 @@
-from thinc.typedefs cimport atom_t
-
-from .tokens cimport TokenC
-
-
-cpdef enum:
-    P2_sic
-    P2_cluster
-    P2_shape
-    P2_prefix
-    P2_suffix
-    P2_pos
-    P2_sense
-
-    P1_sic
-    P1_cluster
-    P1_shape
-    P1_prefix
-    P1_suffix
-    P1_pos
-    P1_sense
-
-    W_sic
-    W_cluster
-    W_shape
-    W_prefix
-    W_suffix
-    W_pos
-    W_sense
-
-    N1_sic
-    N1_cluster
-    N1_shape
-    N1_prefix
-    N1_suffix
-    N1_pos
-    N1_sense
-
-    N2_sic
-    N2_cluster
-    N2_shape
-    N2_prefix
-    N2_suffix
-    N2_pos
-    N2_sense
-
-    N_FIELDS
-
-
-cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil:
-    _fill_from_token(&context[P2_sic], &tokens[i-2])
-    _fill_from_token(&context[P1_sic], &tokens[i-1])
-    _fill_from_token(&context[W_sic], &tokens[i])
-    _fill_from_token(&context[N1_sic], &tokens[i+1])
-    _fill_from_token(&context[N2_sic], &tokens[i+2])
-
-
-cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.sic
-    context[1] = t.lex.cluster
-    context[2] = t.lex.shape
-    context[3] = t.lex.prefix
-    context[4] = t.lex.suffix
-    context[5] = t.pos
-    context[6] = t.sense
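Everything removed here reappears below in spacy/en.pxd, with N_FIELDS renamed to N_CONTEXT_FIELDS and fill_context renamed to fill_pos_context. That relocation is the point of the commit: the feature-window layout is language-specific, so it moves into the English module.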
spacy/en.pxd (70 lines changed)

@@ -1,5 +1,9 @@
-from spacy.lang cimport Language
-from spacy.tokens cimport Tokens
+from thinc.typedefs cimport atom_t
+
+from .lang cimport Language
+from .tokens cimport Tokens
+from .tokens cimport TokenC


 # Flags
 cpdef enum FlagID:
@@ -28,5 +32,67 @@ cpdef enum FlagID:
     IN_NAMES


+cpdef enum:
+    P2_sic
+    P2_cluster
+    P2_shape
+    P2_prefix
+    P2_suffix
+    P2_pos
+    P2_sense
+
+    P1_sic
+    P1_cluster
+    P1_shape
+    P1_prefix
+    P1_suffix
+    P1_pos
+    P1_sense
+
+    W_sic
+    W_cluster
+    W_shape
+    W_prefix
+    W_suffix
+    W_pos
+    W_sense
+
+    N1_sic
+    N1_cluster
+    N1_shape
+    N1_prefix
+    N1_suffix
+    N1_pos
+    N1_sense
+
+    N2_sic
+    N2_cluster
+    N2_shape
+    N2_prefix
+    N2_suffix
+    N2_pos
+    N2_sense
+
+    N_CONTEXT_FIELDS
+
+
+cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
+    _fill_from_token(&context[P2_sic], &tokens[i-2])
+    _fill_from_token(&context[P1_sic], &tokens[i-1])
+    _fill_from_token(&context[W_sic], &tokens[i])
+    _fill_from_token(&context[N1_sic], &tokens[i+1])
+    _fill_from_token(&context[N2_sic], &tokens[i+2])
+
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.sic
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.pos
+    context[6] = t.sense
+
+
 cdef class English(Language):
     pass
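The enum ordering does real work here: each of the five window positions owns seven consecutive slots, so &context[P2_sic] hands _fill_from_token the start of the P2 block, and N_CONTEXT_FIELDS, as the member after the last attribute, is exactly the buffer size callers must allocate. A rough Python rendering of that indexing, illustrative only (the real code is the nogil Cython above, which relies on the TokenC array being padded so that i-2 and i+2 stay in bounds):

    ATTRS = ('sic', 'cluster', 'shape', 'prefix', 'suffix', 'pos', 'sense')
    OFFSETS = (-2, -1, 0, 1, 2)  # P2, P1, W, N1, N2

    def fill_pos_context(tokens, i):
        context = []
        for off in OFFSETS:
            tok = tokens[i + off]            # assumes a padded token sequence
            context.extend(tok[attr] for attr in ATTRS)
        return context                       # 5 * 7 == 35 == N_CONTEXT_FIELDS

    # e.g. the P1_pos slot sits at flat index 7 * 1 + 5 == 12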
spacy/en.pyx (51 lines changed)
@@ -30,11 +30,6 @@ same scheme. Tokenization problems are a major cause of poor performance for
 NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
 provides a fully Penn Treebank 3-compliant tokenizer.
 '''
-# TODO
-#The script translate_treebank_tokenization can be used to transform a treebank's
-#annotation to use one of the spacy tokenization schemes.
-
-
 from __future__ import unicode_literals

 cimport lang
@@ -42,6 +37,32 @@ from .typedefs cimport flags_t
 import orth


+POS_TEMPLATES = (
+    (W_sic,),
+    (P1_sic,),
+    (N1_sic,),
+    (N2_sic,),
+    (P2_sic,),
+
+    (W_suffix,),
+    (W_prefix,),
+
+    (P1_pos,),
+    (P2_pos,),
+    (P1_pos, P2_pos),
+    (P1_pos, W_sic),
+    (P1_suffix,),
+    (N1_suffix,),
+
+    (W_shape,),
+    (W_cluster,),
+    (N1_cluster,),
+    (N2_cluster,),
+    (P1_cluster,),
+    (P2_cluster,),
+)
+
+
 cdef class English(Language):
     """English tokenizer, tightly coupled to lexicon.

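Each template is a tuple of indices into the context array that fill_pos_context populates; thinc's Extractor turns the values a template selects into one feature, so unigram templates like (W_sic,) and conjunctions like (P1_pos, P2_pos) live in the same table. A hedged sketch of the mechanism (the real extraction happens in C inside thinc; names below are illustrative):

    def extract_features(context, templates):
        """Toy version of what thinc's Extractor does with POS_TEMPLATES."""
        feats = []
        for templ_id, templ in enumerate(templates):
            values = tuple(context[idx] for idx in templ)
            # each (template id, selected values) pair becomes one feature,
            # which the learner then maps to a row of weights
            feats.append((templ_id,) + values)
        return feats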
@@ -49,6 +70,9 @@ cdef class English(Language):
         name (unicode): The two letter code used by Wikipedia for the language.
         lexicon (Lexicon): The lexicon. Exposes the lookup method.
     """
+    def get_props(self, unicode string):
+        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}
+
     def set_flags(self, unicode string):
         cdef flags_t flags = 0
         flags |= orth.is_alpha(string) << IS_ALPHA
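get_props is now the single hook through which language-specific lexical properties flow: the Lexicon changes below call it for every fresh Lexeme, and lexeme_init reads the 'dense' key directly. A hypothetical call, using the EN instance defined at the bottom of this file:

    props = EN.get_props(u'example')
    assert set(props) == {'flags', 'dense'}   # flag bit-field + word-shape string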
@@ -64,5 +88,22 @@ cdef class English(Language):
         flags |= orth.like_number(string) << LIKE_NUMBER
         return flags

+    def set_pos(self, Tokens tokens):
+        cdef int i
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        for i in range(tokens.length):
+            fill_pos_context(context, i, tokens.data)
+            tokens.data[i].pos = self.pos_tagger.predict(context)
+
+    def train_pos(self, Tokens tokens, golds):
+        cdef int i
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        c = 0
+        for i in range(tokens.length):
+            fill_pos_context(context, i, tokens.data)
+            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
+            c += tokens.data[i].pos == golds[i]
+        return c
+

 EN = English('en')
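train_pos returns the number of correct guesses for one document, so a driver loop only has to aggregate and report. A hypothetical sketch of such a loop, modelled on the train() helper this commit deletes from spacy/tagger.pyx further down (train_sents and nr_iter are illustrative names, not part of the diff):

    import random

    def train(train_sents, nr_iter=10):
        for _ in range(nr_iter):
            n_corr = 0
            total = 0
            for tokens, golds in train_sents:
                n_corr += EN.train_pos(tokens, golds)  # predict + perceptron update
                total += len(golds)
            print('%.4f' % (n_corr / float(total) * 100))  # per-iteration accuracy
            random.shuffle(train_sents)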
spacy/lang.pxd

@@ -13,7 +13,7 @@ from .utf8string cimport StringStore, UniStr


 cdef class Lexicon:
-    cpdef public set_flags
+    cpdef public get_lex_props
     cdef Pool mem
     cpdef readonly size_t size
     cpdef readonly StringStore strings
spacy/lang.pyx

@@ -37,7 +37,7 @@ cdef class Language:
         self._prefix_re = re.compile(prefix)
         self._suffix_re = re.compile(suffix)
         self._infix_re = re.compile(infix)
-        self.lexicon = Lexicon(self.set_flags)
+        self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
         self.pos_tagger = None
@@ -249,13 +249,13 @@ cdef class Lexicon:

     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self, object set_flags=None):
+    def __init__(self, object get_props):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.size = 2
-        self.set_flags = set_flags
+        self.get_lex_props = get_props

     cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
         '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
@@ -267,9 +267,10 @@ cdef class Lexicon:
             return lex
         if string.n < 3:
             mem = self.mem
+        cdef unicode py_string = string.chars[:string.n]
         lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
-        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
-                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
+        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
+                             self.get_lex_props(py_string))
         if mem is self.mem:
             self._map.set(string.key, lex)
             while self.lexemes.size() < (lex.id + 1):
spacy/lexeme.pxd

@@ -72,17 +72,14 @@ cpdef enum attr_id_t:

     ID
     SIC
-    STEM
     DENSE
     SHAPE
-    ASCIIED
     PREFIX
     SUFFIX

     LENGTH
     CLUSTER
     POS_TYPE
-    SENSE_TYPE


 cdef struct Lexeme:
@@ -90,20 +87,16 @@ cdef struct Lexeme:

     attr_t id
     attr_t sic
-    attr_t stem
     attr_t dense
     attr_t shape
-    attr_t asciied
     attr_t prefix
     attr_t suffix

     attr_t length
     attr_t cluster
     attr_t pos_type
-    attr_t sense_type

     float prob
-    float lower_pc
     float sentiment

@@ -127,12 +120,8 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
         return lex.sic
     elif feat_name == DENSE:
         return lex.dense
-    elif feat_name == STEM:
-        return lex.stem
     elif feat_name == SHAPE:
         return lex.shape
-    elif feat_name == ASCIIED:
-        return lex.asciied
     elif feat_name == PREFIX:
         return lex.prefix
     elif feat_name == SUFFIX:
@@ -143,7 +132,5 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
         return lex.cluster
     elif feat_name == POS_TYPE:
         return lex.pos_type
-    elif feat_name == SENSE_TYPE:
-        return lex.sense_type
     else:
         return 0
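get_attr stays a plain elif chain because it must run nogil; anything not matched falls through to 0 rather than raising. A Python emulation of the trimmed dispatch, for orientation only (a dict stands in for the C Lexeme struct):

    def get_attr(lex, feat_name):
        table = {'ID': 'id', 'SIC': 'sic', 'DENSE': 'dense', 'SHAPE': 'shape',
                 'PREFIX': 'prefix', 'SUFFIX': 'suffix', 'LENGTH': 'length',
                 'CLUSTER': 'cluster', 'POS_TYPE': 'pos_type'}
        return lex[table[feat_name]] if feat_name in table else 0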
spacy/lexeme.pyx

@@ -19,17 +19,12 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,

     lex.cluster = props.get('cluster', 0)
     lex.pos_type = props.get('pos_type', 0)
-    lex.sense_type = props.get('sense_type', 0)
     lex.prob = props.get('prob', 0)

-    lex.lower_pc = props.get('lower_pc', 0.0)
-
     lex.prefix = string_store[string[:1]]
     lex.suffix = string_store[string[-3:]]
     lex.shape = string_store[orth.word_shape(string)]
-    lex.dense = lex.sic if lex.prob >= -10 else lex.shape
-    lex.stem = string_store[props.get('stem', string)]
-    lex.asciied = string_store[orth.asciied(string)]
+    lex.dense = string_store[props['dense']]

     lex.flags = props.get('flags', 0)
     return lex
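Note the asymmetry: most properties keep a .get() default, but 'dense' is looked up with props['dense'], making it a required key of the dict the get_props callback supplies; English.get_props above always provides it via orth.word_shape.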
spacy/tagger.pxd

@@ -3,25 +3,17 @@ from cymem.cymem cimport Pool
 from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
-from preshed.maps cimport PreshMap

 from .typedefs cimport hash_t
 from .tokens cimport Tokens


-cpdef enum TagType:
-    POS
-    SENSE
-
-
 cdef class Tagger:
-    cpdef int set_tags(self, Tokens tokens) except -1
-    cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except *
+    cdef class_t predict(self, atom_t* context, object golds=*) except *

     cpdef readonly Pool mem
     cpdef readonly Extractor extractor
     cpdef readonly LinearModel model

-    cpdef readonly TagType tag_type
     cpdef readonly list tag_names
     cdef dict tagdict
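One Cython detail in the new declaration: .pxd files may not spell out default values, so golds=* only records that the argument has one; the concrete default (golds=None) lives in the matching definition in spacy/tagger.pyx below.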
spacy/tagger.pyx

@@ -2,9 +2,6 @@
 from __future__ import unicode_literals
 from __future__ import division

-from .context cimport fill_context
-from .context cimport N_FIELDS
-
 from os import path
 import os
 import shutil
@@ -15,12 +12,11 @@ import cython
 from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir):
+def setup_model_dir(tag_names, tag_counts, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
-        'tag_type': tag_type,
         'templates': templates,
         'tag_names': tag_names,
         'tag_counts': tag_counts,
@@ -29,29 +25,6 @@ def setup_model_dir(tag_names, tag_counts, templates, model_dir):
         json.dump(config, file_)


-def train(train_sents, model_dir, nr_iter=10):
-    cdef Tokens tokens
-    cdef Tagger tagger = Tagger(model_dir)
-    cdef int i
-    cdef class_t guess = 0
-    cdef class_t gold
-    for _ in range(nr_iter):
-        n_corr = 0
-        total = 0
-        for tokens, golds in train_sents:
-            assert len(tokens) == len(golds), [t.string for t in tokens]
-            for i in range(tokens.length):
-                gold = golds[i]
-                guess = tagger.predict(i, tokens, [gold])
-                tokens.set_tag(i, tagger.tag_type, guess)
-                total += 1
-                n_corr += guess == gold
-            print('%.4f' % ((n_corr / total) * 100))
-        random.shuffle(train_sents)
-    tagger.model.end_training()
-    tagger.model.dump(path.join(model_dir, 'model'))
-
-
 cdef class Tagger:
     """Assign part-of-speech, named entity or supersense tags, using greedy
     decoding. The tagger reads its model and configuration from disk.
@@ -61,26 +34,13 @@ cdef class Tagger:
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
         self.tag_names = cfg['tag_names']
-        self.tag_type = cfg['tag_type']
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))

-    cpdef int set_tags(self, Tokens tokens) except -1:
-        """Assign tags to a Tokens object.
-
-        >>> tokens = EN.tokenize(u'An example sentence.')
-        >>> assert tokens[0].pos == 'NO_TAG'
-        >>> EN.pos_tagger.set_tags(tokens)
-        >>> assert tokens[0].pos == 'DT'
-        """
-        cdef int i
-        for i in range(tokens.length):
-            tokens.set_tag(i, self.tag_type, self.predict(i, tokens))
-
-    cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *:
+    cdef class_t predict(self, atom_t* context, object golds=None) except *:
         """Predict the tag of tokens[i]. The tagger remembers the features and
         prediction, in case you later call tell_answer.
@@ -88,11 +48,6 @@ cdef class Tagger:
         >>> tag = EN.pos_tagger.predict(0, tokens)
         >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
         """
-        cdef atom_t sic = tokens.data[i].lex.sic
-        if sic in self.tagdict:
-            return self.tagdict[sic]
-        cdef atom_t[N_FIELDS] context
-        fill_context(context, i, tokens.data)
         cdef int n_feats
         cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
         cdef weight_t* scores = self.model.get_scores(feats, n_feats)
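Taken together, the tagger.pyx changes strip predict down to a pure model call: it no longer touches Tokens, the tagdict frequent-word shortcut, or the context buffer. Context construction is now the caller's job (English.set_pos and train_pos in en.pyx above), which is precisely what lets the feature layout migrate into the language-specific spacy/en.pxd.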
spacy/tokens.pyx

@@ -140,11 +140,11 @@ cdef class Token:
         self.cluster = lex['cluster']
         self.length = lex['length']
         self.postype = lex['pos_type']
-        self.sensetype = lex['sense_type']
+        self.sensetype = 0
         self.sic = lex['sic']
         self.norm = lex['dense']
         self.shape = lex['shape']
-        self.suffix = lex['asciied']
+        self.suffix = lex['suffix']
         self.prefix = lex['prefix']

         self.prob = lex['prob']