* Generalize tagger code, in preparation for NER and supersense tagging.

This commit is contained in:
Matthew Honnibal 2014-11-05 03:42:14 +11:00
parent 81da61f3cf
commit 3733444101
12 changed files with 247 additions and 52 deletions

View File

@@ -6,7 +6,7 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport Lexeme
from .pos cimport Tagger as PosTagger
from .tagger cimport Tagger
from .utf8string cimport StringStore
@@ -41,14 +41,13 @@ cdef class Language:
cdef PreshMap _specials
cpdef readonly Lexicon lexicon
cpdef readonly PosTagger pos_tagger
cpdef readonly Tagger pos_tagger
cdef object _prefix_re
cdef object _suffix_re
cdef object _infix_re
cpdef Tokens tokenize(self, unicode text)
cpdef Tokens pos_tag(self, Tokens t)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,

View File

@@ -23,7 +23,7 @@ from . import util
from .util import read_lang_data
from .tokens import Tokens
from .pos cimport Tagger as PosTagger
from .tagger cimport Tagger
cdef class Language:
@@ -42,7 +42,7 @@ cdef class Language:
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
self._load_special_tokenization(rules)
if path.exists(path.join(util.DATA_DIR, name, 'pos')):
self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos'))
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
else:
self.pos_tagger = None
@@ -93,16 +93,6 @@ cdef class Language:
self._tokenize(tokens, &span, start, i)
return tokens
cpdef Tokens pos_tag(self, Tokens t):
if self.pos_tagger is None:
return t
cdef int i
t.pos[-1] = self.pos_tagger.encode_pos('EOL')
t.pos[-2] = self.pos_tagger.encode_pos('EOL')
for i in range(t.length):
t.pos[i] = self.pos_tagger.predict(i, t, t.pos[i-1], t.pos[i-2])
return t
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes

View File

@@ -1,22 +0,0 @@
from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .tokens cimport Tokens
cdef class Tagger:
cpdef readonly Extractor extractor
cpdef readonly LinearModel model
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0
cpdef bint tell_answer(self, class_t gold_tag) except *
cdef Pool mem
cdef class_t _guess
cdef atom_t* _atoms
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores

View File

@@ -30,7 +30,7 @@ cdef class Tagger:
if path.exists(tags_loc):
with open(tags_loc) as file_:
Tagger.tags.update(ujson.load(file_))
self.model = LinearModel(len(self.tags), self.extractor.n)
self.model = LinearModel(len(self.tags))
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])

83
spacy/pos_feats.pxd Normal file
View File

@@ -0,0 +1,83 @@
from .tokens cimport Tokens
from thinc.typedefs cimport atom_t
cpdef enum:
P2i
P2c
P2w
P2shape
P2pref
P2suff
P2title
P2upper
P2oft_title
P2oft_upper
P2pos
P2url
P2num
P1i
P1c
P1w
P1shape
P1pre
P1suff
P1title
P1upper
P1oft_title
P1oft_upper
P1pos
P1url
P1num
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0title
N0upper
N0oft_title
N0oft_upper
N0pos
N0url
N0num
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1title
N1upper
N1oft_title
N1oft_upper
N1pos
N1url
N1num
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2title
N2upper
N2oft_title
N2oft_upper
N2pos
N2url
N2num
P2t
P1t
CONTEXT_SIZE
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1

77
spacy/pos_feats.pyx Normal file
View File

@@ -0,0 +1,77 @@
from .lexeme cimport *
from thinc.typedefs cimport atom_t
TEMPLATES = (
(N0i,),
(N0w,),
(N0suff,),
(N0pref,),
(P1t,),
(P2t,),
(P1t, P2t),
(P1t, N0w),
(P1w,),
(P1suff,),
(P2w,),
(N1w,),
(N1suff,),
(N2w,),
(N0shape,),
(N0c,),
(N1c,),
(N2c,),
(P1c,),
(P2c,),
(P1c, N0c),
(N0c, N1c),
(P1c, P1t),
(P1c, P1t, N0c),
(P1t, N0c),
(N0oft_upper,),
(N0oft_title,),
(P1w, N0w),
(N0w, N1w),
(N0pos,),
(P1t, N0pos, N1pos),
(P1t, N1pos),
(N0url,),
(N0num,),
(P1url,),
(P1url,),
(N1num,),
(N1url,),
)
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
_fill_token(&context[P2i], tokens.lex[i-2])
_fill_token(&context[P1i], tokens.lex[i-1])
_fill_token(&context[N0i], tokens.lex[i])
_fill_token(&context[N1i], tokens.lex[i+1])
_fill_token(&context[N2i], tokens.lex[i+2])
context[P1t] = tokens.pos[i-1]
context[P2t] = tokens.pos[i-2]
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.sic
atoms[1] = lex.cluster
atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.flags & (1 << IS_TITLE)
atoms[7] = lex.flags & (1 << IS_UPPER)
atoms[8] = lex.flags & (1 << OFT_TITLE)
atoms[9] = lex.flags & (1 << OFT_UPPER)
atoms[10] = lex.postype
atoms[11] = lex.flags & (1 << LIKE_URL)
atoms[12] = lex.flags & (1 << LIKE_NUMBER)

View File

@@ -6,9 +6,10 @@ from .en import EN
from .pos import Tagger
def read_gold(file_):
def read_gold(file_, tag_list):
paras = file_.read().strip().split('\n\n')
golds = []
tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
for para in paras:
if not para.strip():
continue
@@ -32,10 +33,16 @@ def read_gold(file_):
else:
conll_toks.pop(0)
assert len(tags) == len(tokens)
tags = [Tagger.encode_pos(t) for t in tags]
tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
golds.append((tokens, tags))
return golds
def _encode_pos(tag, tag_ids, tag_list):
if tag not in tag_ids:
tag_ids[tag] = len(tag_list)
tag_list.append(tag)
return tag_ids[tag]
def ptb_to_univ(tag):
mapping = dict(tuple(line.split()) for line in """

View File

@@ -7,7 +7,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .tokens cimport Tokens
cdef enum TagType:
cpdef enum TagType:
POS
ENTITY
SENSE

View File

@@ -1,37 +1,93 @@
# cython: profile=True
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from os import path
import os
import shutil
import random
import codecs
import gzip
import json
import cython
from .pos_feats cimport fill_context as pos_fill_context
from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
from thinc.features cimport ConjFeat
NULL_TAG = 0
def setup_model_dir(tag_type, tag_names, templates, model_dir):
if path.exists(model_dir):
shutil.rmtree(model_dir)
os.mkdir(model_dir)
config = {
'tag_type': tag_type,
'templates': templates,
'tag_names': tag_names,
}
with open(path.join(model_dir, 'config.json'), 'w') as file_:
json.dump(config, file_)
def train(train_sents, model_dir, nr_iter=5):
tagger = Tagger(model_dir)
for _ in range(nr_iter):
n_corr = 0
total = 0
for tokens, golds in train_sents:
assert len(tokens) == len(golds), [t.string for t in tokens]
for i, gold in enumerate(golds):
guess = tagger.predict(i, tokens)
tokens.set_tag(i, tagger.tag_type, guess)
tagger.tell_answer(gold)
if gold != NULL_TAG:
total += 1
n_corr += guess == gold
print('%.4f' % ((n_corr / total) * 100))
random.shuffle(train_sents)
tagger.model.end_training()
tagger.model.dump(path.join(model_dir, 'model'), freq_thresh=10)
def evaluate(tagger, sents):
n_corr = 0
total = 0
for tokens, golds in sents:
for i, gold in enumerate(golds):
guess = tagger.predict(i, tokens)
tokens.set_tag(i, tagger.tag_type, guess)
if gold != NULL_TAG:
total += 1
n_corr += guess == gold
return n_corr / total
cdef class Tagger:
"""Assign part-of-speech, named entity or supersense tags, using greedy
decoding. The tagger reads its model and configuration from disk.
"""
def __init__(self, model_dir):
self.mem = Pool()
cfg = json.load(path.join(model_dir, 'config.json'))
cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates']
self.tag_names = cfg['tag_names']
self.tag_type = cfg['tag_type']
self.model = LinearModel(len(self.tag_names))
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
self.model = LinearModel(len(self.tag_names), self.extractor.n)
print("Load tagger model")
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
print("Done")
if self.tag_type == POS:
n_context = POS_CONTEXT_SIZE
self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(len(self.cfg.tags), sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
self._guess = NULL_TAG
cpdef int set_tags(self, Tokens tokens) except -1:
@@ -54,8 +110,10 @@ cdef class Tagger:
>>> tag = EN.pos_tagger.predict(0, tokens)
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
"""
#if self.tag_type == POS:
# _pos_feats.fill_context(self._context, i, tokens)
if self.tag_type == POS:
pos_fill_context(self._context, i, tokens)
else:
raise StandardError
self.extractor.extract(self._feats, self._values, self._context, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess

View File

@@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from .lexeme cimport Lexeme
from .typedefs cimport flag_t
from .utf8string cimport StringStore
from .tagger cimport TagType
from thinc.typedefs cimport atom_t
@@ -23,6 +24,7 @@ cdef class Tokens:
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
cdef int push_back(self, int i, Lexeme* lexeme) except -1
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
cdef class Token:

View File

@@ -4,6 +4,7 @@ cimport cython
DEF PADDING = 5
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
raise IndexError
@@ -89,6 +90,9 @@ cdef class Tokens:
idx = self.push_back(idx, lexemes[i])
return idx
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
self.pos[i] = tag
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
@@ -130,4 +134,3 @@ cdef class Token:
return ''
cdef bytes utf8string = self._string_store[self.sic]
return utf8string.decode('utf8')

View File

@@ -6,5 +6,3 @@ ctypedef uint64_t flag_t
ctypedef uint32_t id_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t