* Rearrange POS stuff, so that language-specific stuff can live in language-specific modules

Matthew Honnibal 2014-12-07 23:52:41 +11:00
parent 327383e38a
commit ef4398b204
11 changed files with 127 additions and 154 deletions

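The diff below moves the POS feature context and templates out of spacy.context and spacy.tagger and into the English-specific modules, so tagging is driven through the Language subclass. A rough usage sketch of the resulting API (illustrative only: EN, tokenize, set_pos and train_pos appear in the diff; the surrounding loop and a ready pos_tagger are assumed):

# Illustrative sketch, not part of the commit.  Assumes EN.pos_tagger has
# already been set up; the training loop around train_pos is assumed.
from spacy.en import EN

def train_and_tag(train_sents, nr_iter=5):
    for _ in range(nr_iter):
        n_corr = 0
        total = 0
        for raw, golds in train_sents:
            tokens = EN.tokenize(raw)
            n_corr += EN.train_pos(tokens, golds)   # returns number correct
            total += len(tokens)
        print('%.4f' % (n_corr / float(total) * 100))
    tokens = EN.tokenize(u'An example sentence.')
    EN.set_pos(tokens)           # tags written into tokens.data[i].pos
    return tokens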
View File

@@ -55,7 +55,6 @@ exts = [
    Extension("spacy.utf8string", ["spacy/utf8string.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.index", ["spacy/index.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.tagger", ["spacy/tagger.pyx"], language="c++", include_dirs=includes),
    Extension("spacy.context", ["spacy/context.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.pos_feats", ["spacy/pos_feats.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner._state", ["spacy/ner/_state.pyx"], language="c++", include_dirs=includes),
    #Extension("spacy.ner.bilou_moves", ["spacy/ner/bilou_moves.pyx"], language="c++", include_dirs=includes),

View File

@@ -1,64 +1 @@
from thinc.typedefs cimport atom_t
from .tokens cimport TokenC

cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_sense
    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_sense
    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_sense
    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_sense
    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_sense
    N_FIELDS

cdef inline void fill_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])

cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.sense

View File

@@ -1,5 +1,9 @@
from spacy.lang cimport Language
from spacy.tokens cimport Tokens
from thinc.typedefs cimport atom_t
from .lang cimport Language
from .tokens cimport Tokens
from .tokens cimport TokenC
# Flags
cpdef enum FlagID:
@@ -28,5 +32,67 @@ cpdef enum FlagID:
    IN_NAMES

cpdef enum:
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_sense
    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_sense
    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_sense
    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_sense
    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_sense
    N_CONTEXT_FIELDS

cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    _fill_from_token(&context[P2_sic], &tokens[i-2])
    _fill_from_token(&context[P1_sic], &tokens[i-1])
    _fill_from_token(&context[W_sic], &tokens[i])
    _fill_from_token(&context[N1_sic], &tokens[i+1])
    _fill_from_token(&context[N2_sic], &tokens[i+2])

cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.pos
    context[6] = t.sense

cdef class English(Language):
    pass

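For reference, fill_pos_context above writes seven attributes per token (sic, cluster, shape, prefix, suffix, pos, sense) into a flat array of atoms, one block of seven per window position P2, P1, W, N1, N2, so N_CONTEXT_FIELDS works out to 35. A minimal pure-Python sketch of that layout (the padding handling is a simplification; the Cython version indexes the token array directly):

# Pure-Python sketch of the context layout; illustrative only.
FIELDS = ('sic', 'cluster', 'shape', 'prefix', 'suffix', 'pos', 'sense')
N_CONTEXT_FIELDS = 5 * len(FIELDS)  # 35, the final member of the enum above

def fill_pos_context_py(tokens, i):
    # tokens: list of dicts with the seven fields, standing in for TokenC*.
    context = [0] * N_CONTEXT_FIELDS
    padding = dict.fromkeys(FIELDS, 0)
    window = [tokens[j] if 0 <= j < len(tokens) else padding
              for j in (i - 2, i - 1, i, i + 1, i + 2)]
    for pos_idx, tok in enumerate(window):
        for field_idx, name in enumerate(FIELDS):
            context[pos_idx * len(FIELDS) + field_idx] = tok[name]
    return context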
View File

@@ -30,11 +30,6 @@ same scheme. Tokenization problems are a major cause of poor performance for
NLP tools. If you're using a pre-trained model, the :py:mod:`spacy.ptb3` module
provides a fully Penn Treebank 3-compliant tokenizer.
'''
# TODO
#The script translate_treebank_tokenization can be used to transform a treebank's
#annotation to use one of the spacy tokenization schemes.
from __future__ import unicode_literals
cimport lang
@@ -42,6 +37,32 @@ from .typedefs cimport flags_t
import orth
POS_TEMPLATES = (
    (W_sic,),
    (P1_sic,),
    (N1_sic,),
    (N2_sic,),
    (P2_sic,),
    (W_suffix,),
    (W_prefix,),
    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),
    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),
)

cdef class English(Language):
    """English tokenizer, tightly coupled to lexicon.
@@ -49,6 +70,9 @@ cdef class English(Language):
        name (unicode): The two letter code used by Wikipedia for the language.
        lexicon (Lexicon): The lexicon. Exposes the lookup method.
    """
    def get_props(self, unicode string):
        return {'flags': self.set_flags(string), 'dense': orth.word_shape(string)}

    def set_flags(self, unicode string):
        cdef flags_t flags = 0
        flags |= orth.is_alpha(string) << IS_ALPHA
@@ -64,5 +88,22 @@ cdef class English(Language):
        flags |= orth.like_number(string) << LIKE_NUMBER
        return flags

    def set_pos(self, Tokens tokens):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        for i in range(tokens.length):
            fill_pos_context(context, i, tokens.data)
            tokens.data[i].pos = self.pos_tagger.predict(context)

    def train_pos(self, Tokens tokens, golds):
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        c = 0
        for i in range(tokens.length):
            fill_pos_context(context, i, tokens.data)
            tokens.data[i].pos = self.pos_tagger.predict(context, [golds[i]])
            c += tokens.data[i].pos == golds[i]
        return c
EN = English('en')

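Each tuple in POS_TEMPLATES names context slots declared in spacy/en.pxd; the feature extractor turns every template into one feature per token by reading those slots out of the filled context. A simplified approximation of that step (thinc's Extractor does this in C and handles template ids and values differently; this only shows what a template such as (P1_pos, W_sic) expresses):

# Simplified view of template-based feature extraction; not thinc's API.
def extract_features(context, templates):
    feats = []
    for templ_id, templ in enumerate(templates):
        # e.g. (P1_pos, W_sic) pairs the previous tag with the current word
        feats.append((templ_id, tuple(context[slot] for slot in templ)))
    return feats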
View File

@@ -13,7 +13,7 @@ from .utf8string cimport StringStore, UniStr
cdef class Lexicon:
    cpdef public set_flags
    cpdef public get_lex_props
    cdef Pool mem
    cpdef readonly size_t size
    cpdef readonly StringStore strings

View File

@@ -37,7 +37,7 @@ cdef class Language:
        self._prefix_re = re.compile(prefix)
        self._suffix_re = re.compile(suffix)
        self._infix_re = re.compile(infix)
        self.lexicon = Lexicon(self.set_flags)
        self.lexicon = Lexicon(self.get_props)
        self._load_special_tokenization(rules)
        self.pos_tagger = None
@@ -249,13 +249,13 @@ cdef class Lexicon:
    Also interns UTF-8 strings, and maps them to consecutive integer IDs.
    '''
    def __init__(self, object set_flags=None):
    def __init__(self, object get_props):
        self.mem = Pool()
        self._map = PreshMap(2 ** 20)
        self.strings = StringStore()
        self.lexemes.push_back(&EMPTY_LEXEME)
        self.size = 2
        self.set_flags = set_flags
        self.get_lex_props = get_props

    cdef const Lexeme* get(self, Pool mem, UniStr* string) except NULL:
        '''Get a pointer to a Lexeme from the lexicon, creating a new Lexeme
@@ -267,9 +267,10 @@ cdef class Lexicon:
            return lex
        if string.n < 3:
            mem = self.mem
        cdef unicode py_string = string.chars[:string.n]
        lex = <Lexeme*>mem.alloc(sizeof(Lexeme), 1)
        lex[0] = lexeme_init(self.size, string.chars[:string.n], string.key,
                             self.strings, {'flags': self.set_flags(string.chars[:string.n])})
        lex[0] = lexeme_init(self.size, py_string, string.key, self.strings,
                             self.get_lex_props(py_string))
        if mem is self.mem:
            self._map.set(string.key, lex)
            while self.lexemes.size() < (lex.id + 1):

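The Lexicon now takes a get_props callable rather than a set_flags one: any function from a unicode string to a dict of lexical properties will do, and English.get_props (above) supplies 'flags' and 'dense'. A toy stand-in, assuming the keys read by lexeme_init in spacy/lexeme.pyx below:

# Toy get_props callable; illustrative only.  lexeme_init interns the
# 'dense' value through the string store, so it must be a string, and it
# falls back to defaults for missing keys such as 'cluster' or 'prob'.
def toy_get_props(string):
    return {'flags': 0, 'dense': string}   # stand-in for orth.word_shape(string)

# lexicon = Lexicon(toy_get_props)   # mirrors Lexicon(self.get_props) above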
View File

@@ -72,17 +72,14 @@ cpdef enum attr_id_t:
    ID
    SIC
    STEM
    DENSE
    SHAPE
    ASCIIED
    PREFIX
    SUFFIX
    LENGTH
    CLUSTER
    POS_TYPE
    SENSE_TYPE
cdef struct Lexeme:
@@ -90,20 +87,16 @@ cdef struct Lexeme:
    attr_t id
    attr_t sic
    attr_t stem
    attr_t dense
    attr_t shape
    attr_t asciied
    attr_t prefix
    attr_t suffix
    attr_t length
    attr_t cluster
    attr_t pos_type
    attr_t sense_type
    float prob
    float lower_pc
    float sentiment
@@ -127,12 +120,8 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
        return lex.sic
    elif feat_name == DENSE:
        return lex.dense
    elif feat_name == STEM:
        return lex.stem
    elif feat_name == SHAPE:
        return lex.shape
    elif feat_name == ASCIIED:
        return lex.asciied
    elif feat_name == PREFIX:
        return lex.prefix
    elif feat_name == SUFFIX:
@@ -143,7 +132,5 @@ cdef inline attr_t get_attr(const Lexeme* lex, attr_id_t feat_name) nogil:
        return lex.cluster
    elif feat_name == POS_TYPE:
        return lex.pos_type
    elif feat_name == SENSE_TYPE:
        return lex.sense_type
    else:
        return 0

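get_attr is a straight dispatch from attr_id_t constants to Lexeme struct fields, with the STEM, ASCIIED and SENSE_TYPE branches dropped in this commit. The same mapping in plain Python, over a dict-shaped lexeme (illustrative only; the real function is an inline nogil lookup on the struct):

# Plain-Python analogue of the trimmed get_attr dispatch; illustrative only.
ATTR_FIELDS = {
    'SIC': 'sic', 'DENSE': 'dense', 'SHAPE': 'shape', 'PREFIX': 'prefix',
    'SUFFIX': 'suffix', 'LENGTH': 'length', 'CLUSTER': 'cluster',
    'POS_TYPE': 'pos_type',
}

def get_attr_py(lex, feat_name):
    field = ATTR_FIELDS.get(feat_name)
    return lex[field] if field is not None else 0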
View File

@@ -19,17 +19,12 @@ cpdef Lexeme init(id_t i, unicode string, hash_t hashed,
    lex.cluster = props.get('cluster', 0)
    lex.pos_type = props.get('pos_type', 0)
    lex.sense_type = props.get('sense_type', 0)
    lex.prob = props.get('prob', 0)
    lex.lower_pc = props.get('lower_pc', 0.0)
    lex.prefix = string_store[string[:1]]
    lex.suffix = string_store[string[-3:]]
    lex.shape = string_store[orth.word_shape(string)]
    lex.dense = lex.sic if lex.prob >= -10 else lex.shape
    lex.stem = string_store[props.get('stem', string)]
    lex.asciied = string_store[orth.asciied(string)]
    lex.dense = string_store[props['dense']]
    lex.flags = props.get('flags', 0)
    return lex

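lexeme_init still derives prefix, suffix and shape itself, but now takes the 'dense' form straight from the props dict instead of computing it from the probability. The string operations involved, shown on a plain Python string (the shape value is illustrative):

# Illustrative only.
word = u'Apple'
prefix = word[:1]     # u'A'
suffix = word[-3:]    # u'ple'  (for words shorter than 3, the whole word)
# orth.word_shape(word) produces a shape code along the lines of u'Xxxxx';
# English.get_props stores that string under 'dense', and lexeme_init
# interns it via string_store[props['dense']].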
View File

@@ -3,25 +3,17 @@ from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMap
from .typedefs cimport hash_t
from .tokens cimport Tokens
cpdef enum TagType:
    POS
    SENSE

cdef class Tagger:
    cpdef int set_tags(self, Tokens tokens) except -1
    cpdef class_t predict(self, int i, Tokens tokens, object golds=*) except *
    cdef class_t predict(self, atom_t* context, object golds=*) except *
    cpdef readonly Pool mem
    cpdef readonly Extractor extractor
    cpdef readonly LinearModel model
    cpdef readonly TagType tag_type
    cpdef readonly list tag_names
    cdef dict tagdict

View File

@@ -2,9 +2,6 @@
from __future__ import unicode_literals
from __future__ import division
from .context cimport fill_context
from .context cimport N_FIELDS
from os import path
import os
import shutil
@@ -15,12 +12,11 @@ import cython
from thinc.features cimport Feature, count_feats
def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir):
def setup_model_dir(tag_names, tag_counts, templates, model_dir):
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    os.mkdir(model_dir)
    config = {
        'tag_type': tag_type,
        'templates': templates,
        'tag_names': tag_names,
        'tag_counts': tag_counts,
@@ -29,29 +25,6 @@ def setup_model_dir(tag_type, tag_names, tag_counts, templates, model_dir):
        json.dump(config, file_)
def train(train_sents, model_dir, nr_iter=10):
    cdef Tokens tokens
    cdef Tagger tagger = Tagger(model_dir)
    cdef int i
    cdef class_t guess = 0
    cdef class_t gold
    for _ in range(nr_iter):
        n_corr = 0
        total = 0
        for tokens, golds in train_sents:
            assert len(tokens) == len(golds), [t.string for t in tokens]
            for i in range(tokens.length):
                gold = golds[i]
                guess = tagger.predict(i, tokens, [gold])
                tokens.set_tag(i, tagger.tag_type, guess)
                total += 1
                n_corr += guess == gold
        print('%.4f' % ((n_corr / total) * 100))
        random.shuffle(train_sents)
    tagger.model.end_training()
    tagger.model.dump(path.join(model_dir, 'model'))

cdef class Tagger:
    """Assign part-of-speech, named entity or supersense tags, using greedy
    decoding. The tagger reads its model and configuration from disk.
@@ -61,26 +34,13 @@ cdef class Tagger:
        cfg = json.load(open(path.join(model_dir, 'config.json')))
        templates = cfg['templates']
        self.tag_names = cfg['tag_names']
        self.tag_type = cfg['tag_type']
        self.tagdict = _make_tag_dict(cfg['tag_counts'])
        self.extractor = Extractor(templates)
        self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))

    cpdef int set_tags(self, Tokens tokens) except -1:
        """Assign tags to a Tokens object.

        >>> tokens = EN.tokenize(u'An example sentence.')
        >>> assert tokens[0].pos == 'NO_TAG'
        >>> EN.pos_tagger.set_tags(tokens)
        >>> assert tokens[0].pos == 'DT'
        """
        cdef int i
        for i in range(tokens.length):
            tokens.set_tag(i, self.tag_type, self.predict(i, tokens))

    cpdef class_t predict(self, int i, Tokens tokens, object golds=None) except *:
    cdef class_t predict(self, atom_t* context, object golds=None) except *:
        """Predict the tag of tokens[i]. The tagger remembers the features and
        prediction, in case you later call tell_answer.
@@ -88,11 +48,6 @@ cdef class Tagger:
        >>> tag = EN.pos_tagger.predict(0, tokens)
        >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
        """
        cdef atom_t sic = tokens.data[i].lex.sic
        if sic in self.tagdict:
            return self.tagdict[sic]
        cdef atom_t[N_FIELDS] context
        fill_context(context, i, tokens.data)
        cdef int n_feats
        cdef Feature* feats = self.extractor.get_feats(context, &n_feats)
        cdef weight_t* scores = self.model.get_scores(feats, n_feats)

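With the tag-dictionary shortcut and the context filling moved out to the language module, Tagger.predict reduces to feature extraction and scoring over a caller-supplied context. Roughly, in Python terms (extract and score stand in for thinc's Extractor.get_feats and LinearModel.get_scores; the argmax over the scores array is assumed to mirror what the C code does):

# Rough outline of what remains in Tagger.predict; not thinc's actual API.
def predict(context, extract, score, n_classes):
    feats = extract(context)      # one feature per template
    scores = score(feats)         # one weight per tag class
    return max(range(n_classes), key=lambda clas: scores[clas])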
View File

@@ -140,11 +140,11 @@ cdef class Token:
        self.cluster = lex['cluster']
        self.length = lex['length']
        self.postype = lex['pos_type']
        self.sensetype = lex['sense_type']
        self.sensetype = 0
        self.sic = lex['sic']
        self.norm = lex['dense']
        self.shape = lex['shape']
        self.suffix = lex['asciied']
        self.suffix = lex['suffix']
        self.prefix = lex['prefix']
        self.prob = lex['prob']