mirror of https://github.com/explosion/spaCy.git

commit 0af139e183 (parent 320ced276a)

* Tagger training now working. Still need to test load/save of model. Morphology still broken.
tag_map.json (English tag map):

@@ -1,11 +1,12 @@
 {
-    ".": {"pos": "punc", "punctype": "peri"},
-    ",": {"pos": "punc", "punctype": "comm"},
-    "-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"},
-    "-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"},
-    "``": {"pos": "punc", "punctype": "quot", "puncside": "ini"},
-    "\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"},
-    ":": {"pos": "punc"},
+    ".": {"pos": "punct", "puncttype": "peri"},
+    ",": {"pos": "punct", "puncttype": "comm"},
+    "-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
+    "-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
+    "``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
+    "\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    "''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    ":": {"pos": "punct"},
     "$": {"pos": "sym", "other": {"symtype": "currency"}},
     "#": {"pos": "sym", "other": {"symtype": "numbersign"}},
     "AFX": {"pos": "adj", "hyph": "hyph"},
@@ -13,15 +14,15 @@
     "CD": {"pos": "num", "numtype": "card"},
     "DT": {"pos": "adj", "prontype": "prn"},
     "EX": {"pos": "adv", "advtype": "ex"},
-    "FW": {"foreign": "foreign"},
-    "HYPH": {"pos": "punc", "punctype": "dash"},
+    "FW": {"pos": "x", "foreign": "foreign"},
+    "HYPH": {"pos": "punct", "puncttype": "dash"},
     "IN": {"pos": "adp"},
     "JJ": {"pos": "adj", "degree": "pos"},
     "JJR": {"pos": "adj", "degree": "comp"},
     "JJS": {"pos": "adj", "degree": "sup"},
-    "LS": {"pos": "punc", "numtype": "ord"},
+    "LS": {"pos": "punct", "numtype": "ord"},
     "MD": {"pos": "verb", "verbtype": "mod"},
-    "NIL": {},
+    "NIL": {"pos": "no_tag"},
     "NN": {"pos": "noun", "number": "sing"},
     "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
     "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
@@ -36,7 +37,7 @@
     "RP": {"pos": "part"},
     "SYM": {"pos": "sym"},
     "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
-    "UH": {"pos": "int"},
+    "UH": {"pos": "intJ"},
     "VB": {"pos": "verb", "verbform": "inf"},
     "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
     "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
@@ -47,5 +48,13 @@
     "WP": {"pos": "noun", "prontype": "int|rel"},
     "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
     "WRB": {"pos": "adv", "prontype": "int|rel"},
-    "SP": {"pos": "space"}
+    "SP": {"pos": "space"},
+    "ADD": {"pos": "x"},
+    "NFP": {"pos": "punct"},
+    "GW": {"pos": "x"},
+    "AFX": {"pos": "x"},
+    "HYPH": {"pos": "punct"},
+    "XX": {"pos": "x"},
+    "BES": {"pos": "verb"},
+    "HVS": {"pos": "verb"},
 }
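Note: each key above is a Penn Treebank fine-grained tag, and each value carries the coarse universal "pos" plus morphological features; Morphology.from_dir (further down in this commit) loads this file with json.load. A minimal stand-alone sketch of a lookup (the file path is illustrative):

    import json

    with open('tag_map.json') as file_:
        tag_map = json.load(file_)

    props = tag_map['VBD']          # {"pos": "verb", "verbform": "fin", "tense": "past"}
    assert props['pos'] == 'verb'
    assert props['tense'] == 'past'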
spacy/_ml.pyx:

@@ -91,6 +91,8 @@ cdef class Model:
                 count_feats(counts[guess], feats, n_feats, -cost)
         self._model.update(counts)
 
-    def end_training(self):
+    def end_training(self, model_loc=None):
+        if model_loc is None:
+            model_loc = self.model_loc
         self._model.end_training()
-        self._model.dump(self.model_loc, freq_thresh=0)
+        self._model.dump(model_loc, freq_thresh=0)
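Note: the new model_loc parameter keeps the old zero-argument behaviour (falling back to the path the model was constructed with) while letting callers pick the dump target per call; the language.py hunk below relies on exactly that. A hedged usage sketch (model and data_dir are illustrative names):

    from os import path

    model.end_training()                                     # dumps to self.model_loc, as before
    model.end_training(path.join(data_dir, 'pos', 'model'))  # dumps to an explicit location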
spacy/language.py:

@@ -1,5 +1,10 @@
 from os import path
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
 from .tokenizer import Tokenizer
 from .morphology import Morphology
 from .vocab import Vocab
@@ -13,6 +18,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
+
 
 class Language(object):
     @staticmethod
@@ -113,14 +120,6 @@ class Language(object):
             attrs.IS_OOV: lambda string: True
         }
 
-    @classmethod
-    def default_dep_templates(cls):
-        return []
-
-    @classmethod
-    def default_ner_templates(cls):
-        return []
-
     @classmethod
     def default_dep_labels(cls):
         return {0: {'ROOT': True}}
@@ -186,10 +185,11 @@ class Language(object):
         return None
 
     @classmethod
-    def default_matcher(cls, vocab, data_dir=None):
-        if data_dir is None:
-            data_dir = cls.default_data_dir()
-        return Matcher.from_dir(data_dir, vocab)
+    def default_matcher(cls, vocab, data_dir):
+        if path.exists(data_dir):
+            return Matcher.from_dir(data_dir, vocab)
+        else:
+            return None
 
     def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                  parser=None, entity=None, matcher=None, serializer=None):
@@ -245,9 +245,9 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.entity.model.end_training()
-        self.tagger.model.end_training()
+        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
+        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
+        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
         self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
 
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
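Note: the import fallback added at the top of this and several other modules prefers the C-backed ujson parser but degrades gracefully to the stdlib, so ujson stays an optional dependency. The pattern in isolation:

    try:
        import ujson as json   # fast C implementation
    except ImportError:
        import json            # stdlib fallback; the same call sites work unchanged

    tag_map = json.loads('{"VBD": {"pos": "verb", "tense": "past"}}')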
spacy/lemmatizer.py:

@@ -2,29 +2,41 @@ from __future__ import unicode_literals
 from os import path
 import codecs
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
+from .parts_of_speech import NOUN, VERB, ADJ
+
 
 class Lemmatizer(object):
-    def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
-        self.noun_id = noun_id
-        self.verb_id = verb_id
-        self.adj_id = adj_id
-        self.index = {}
-        self.exc = {}
+    @classmethod
+    def from_dir(cls, data_dir):
+        index = {}
+        exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
-            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
+            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+        return cls(index, exc, rules)
+
+    def __init__(self, index, exceptions, rules):
+        self.index = index
+        self.exc = exceptions
+        self.rules = rules
 
     def __call__(self, string, pos):
-
-        return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
-        if pos == self.noun_id:
-            return self.noun(string)
-        elif pos == self.verb_id:
-            return self.verb(string)
-        elif pos == self.adj_id:
-            return self.adj(string)
+        if pos == NOUN:
+            pos = 'noun'
+        elif pos == VERB:
+            pos = 'verb'
+        elif pos == ADJ:
+            pos = 'adj'
         else:
             raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
-        return string
+        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
+        return min(lemmas)
 
     def noun(self, string):
         return self(string, 'noun')
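Note: lemmatize() itself is outside this hunk, but the call shows the data model: a per-POS index of known lemmas, an exception table for irregular forms, and suffix-rewrite rules, with min() picking a canonical lemma from the candidate set. A minimal sketch of how such a function could work — an assumption about the internals, not the actual spaCy code:

    def lemmatize(string, index, exc, rules):
        # Hypothetical reimplementation, for illustration only.
        lemmas = set()
        if string in exc:                  # irregular forms win outright
            lemmas.update(exc[string])
        for old, new in rules:             # e.g. ('ies', 'y'), ('s', '')
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if form in index:          # keep only attested lemmas
                    lemmas.add(form)
        return lemmas if lemmas else set([string])

    assert min(lemmatize('ducks', set(['duck']), {}, [('s', '')])) == 'duck'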
spacy/morphology.pxd:

@@ -1,13 +1,16 @@
 from .structs cimport TokenC
+from .strings cimport StringStore
 
 
 cdef class Morphology:
+    cdef readonly object strings
     cdef public object lemmatizer
     cdef public object tag_map
     cdef public object tag_names
     cdef public object tag_ids
     cdef public int n_tags
 
-    cdef int assign_tag(self, TokenC* token, int tag) except -1
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1
+
spacy/morphology.pyx:

@@ -1,4 +1,5 @@
 from os import path
+from .lemmatizer import Lemmatizer
 
 try:
     import ujson as json
@@ -9,7 +10,15 @@ from spacy.parts_of_speech import UNIV_POS_NAMES
+
 
 cdef class Morphology:
+    @classmethod
+    def from_dir(cls, data_dir, lemmatizer=None):
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        if lemmatizer is None:
+            lemmatizer = Lemmatizer.from_dir(data_dir)
+        return cls(tag_map, {}, lemmatizer)
+
     def __init__(self, tag_map, fused_tokens, lemmatizer):
         self.lemmatizer = lemmatizer
         self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -17,15 +26,13 @@ cdef class Morphology:
         for i, tag_str in enumerate(self.tag_names):
             self.tag_ids[tag_str] = i
 
-    @classmethod
-    def from_dir(cls, data_dir):
-        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        return cls(tag_map, {}, None)
-
-    cdef int assign_tag(self, TokenC* token, int tag) except -1:
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1:
+        # TODO Caching
         props = self.tag_map[self.tag_names[tag]]
         token.pos = UNIV_POS_NAMES[props['pos'].upper()]
-        token.tag = tag
+        token.tag = strings[self.tag_names[tag]]
+        lemma = self.lemmatizer(strings[token.lex.orth], token.pos)
+        token.lemma = strings[lemma]
         #token.inflection = # TODO
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1:
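Note: this signature change is the heart of the commit: token.tag used to hold the tag's index into tag_names, and now holds the interned id of the tag string itself, which is why assign_tag needs the StringStore (and can also intern the lemma it computes). A sketch of the resulting invariant (names are illustrative; strings maps string -> id and id -> string both ways, as spaCy's StringStore does):

    tag_str = morphology.tag_names[guess]   # e.g. 'VBD'
    token_tag = strings[tag_str]            # intern: string -> integer id
    assert strings[token_tag] == tag_str    # and the id round-trips back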
spacy/parts_of_speech.pxd:

@@ -2,17 +2,22 @@
 cpdef enum univ_pos_t:
     NO_TAG
     ADJ
-    ADV
     ADP
+    ADV
+    AUX
     CONJ
     DET
+    INTJ
     NOUN
     NUM
+    PART
     PRON
-    PRT
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
     VERB
     X
-    PUNCT
     EOL
     SPACE
     N_UNIV_TAGS
spacy/parts_of_speech.pyx:

@@ -4,18 +4,22 @@ from __future__ import unicode_literals
 UNIV_POS_NAMES = {
     "NO_TAG": NO_TAG,
     "ADJ": ADJ,
-    "ADV": ADV,
     "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
     "CONJ": CONJ,
     "DET": DET,
+    "INTJ": INTJ,
     "NOUN": NOUN,
     "NUM": NUM,
+    "PART": PART,
     "PRON": PRON,
-    "PRT": PRT,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
     "VERB": VERB,
     "X": X,
-    "PUNCT": PUNCT,
-    "PUNC": PUNCT,
-    "SPACE": SPACE,
-    "EOL": EOL
+    "EOL": EOL,
+    "SPACE": SPACE
 }
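Note: dropping the "PUNC" alias here mirrors the tag map's move from "punc" to "punct". This dict is what lets the JSON tag map stay lower-case while the enum is upper-case; Morphology.assign_tag above does the bridging:

    token_pos = UNIV_POS_NAMES[props['pos'].upper()]   # 'punct' -> PUNCT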
spacy/tagger.pxd:

@@ -1,26 +1,12 @@
-from preshed.maps cimport PreshMapArray
-from preshed.counter cimport PreshCounter
-from cymem.cymem cimport Pool
-
 from ._ml cimport Model
-from .strings cimport StringStore
-from .structs cimport TokenC, LexemeC
-from .parts_of_speech cimport univ_pos_t
+from .structs cimport TokenC
 from .vocab cimport Vocab
 
 
 cdef class Tagger:
-    cdef readonly Pool mem
-    cdef readonly StringStore strings
-    cdef readonly Model model
     cdef readonly Vocab vocab
-    cdef public object lemmatizer
-    cdef PreshMapArray _morph_cache
+    cdef readonly Model model
     cdef public dict freqs
 
-    cdef readonly int n_tags
-
     cdef int predict(self, int i, const TokenC* tokens) except -1
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1
-    #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
-    #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
spacy/tagger.pyx:

@@ -8,7 +8,7 @@ from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
 from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
+from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 
 from .attrs cimport *
 from ._ml cimport arg_max
@@ -102,24 +102,10 @@ cdef class Tagger:
         (P2_flags,),
     )
 
-    def make_lemmatizer(self):
-        return None
-
-    def __init__(self, Vocab vocab, templates):
-        self.mem = Pool()
-        self.vocab = vocab
-
-        cdef int n_tags = self.vocab.morphology.n_tags + 1
-
-        self.model = Model(n_tags, templates)
-        self.freqs = {TAG: defaultdict(int)}
-        for tag in self.tag_names:
-            self.freqs[TAG][self.vocab.strings[tag]] = 1
-        self.freqs[TAG][0] = 1
-
-    @property
-    def tag_names(self):
-        return tuple(sorted(self.vocab.morphology.tag_map.keys()))
-
+    @classmethod
+    def blank(cls, vocab, templates):
+        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        return cls(vocab, model)
+
     @classmethod
     def from_dir(cls, data_dir, vocab):
@@ -127,7 +113,22 @@ cdef class Tagger:
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        return cls(vocab, templates)
+        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        return cls(vocab, model)
+
+    def __init__(self, Vocab vocab, model):
+        self.vocab = vocab
+        self.model = model
+
+        # TODO: Move this to tag map
+        self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
+
+    @property
+    def tag_names(self):
+        return self.vocab.morphology.tag_names
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -142,29 +143,28 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def train(self, Doc tokens, object gold_tag_strs):
         assert len(tokens) == len(gold_tag_strs)
         cdef int i
         cdef int loss
         cdef const weight_t* scores
-        golds = [self.tag_names.index(g) if g is not None else -1
-                 for g in gold_tag_strs]
+        golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         correct = 0
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
 
-            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
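Note: putting the pieces together, the intended training flow after this commit looks roughly like the sketch below (train_data, vocab, and data_dir are illustrative; Tagger.blank, Tagger.train, and Model.end_training all come from the hunks above):

    tagger = Tagger.blank(vocab, Tagger.default_templates())
    for doc, gold_tags in train_data:             # one PTB tag string per token
        n_correct = tagger.train(doc, gold_tags)
    # Dump the averaged weights, as Language.end_training does.
    tagger.model.end_training(path.join(data_dir, 'pos', 'model'))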