* Tagger training now working. Still need to test load/save of model. Morphology still broken.

Matthew Honnibal 2015-08-27 09:16:11 +02:00
parent 320ced276a
commit 0af139e183
10 changed files with 134 additions and 106 deletions

View File

@@ -1,11 +1,12 @@
 {
-    ".": {"pos": "punc", "punctype": "peri"},
-    ",": {"pos": "punc", "punctype": "comm"},
-    "-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"},
-    "-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"},
-    "``": {"pos": "punc", "punctype": "quot", "puncside": "ini"},
-    "\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"},
-    ":": {"pos": "punc"},
+    ".": {"pos": "punct", "puncttype": "peri"},
+    ",": {"pos": "punct", "puncttype": "comm"},
+    "-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
+    "-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
+    "``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
+    "\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    "''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    ":": {"pos": "punct"},
     "$": {"pos": "sym", "other": {"symtype": "currency"}},
     "#": {"pos": "sym", "other": {"symtype": "numbersign"}},
     "AFX": {"pos": "adj", "hyph": "hyph"},
@@ -13,15 +14,15 @@
     "CD": {"pos": "num", "numtype": "card"},
     "DT": {"pos": "adj", "prontype": "prn"},
     "EX": {"pos": "adv", "advtype": "ex"},
-    "FW": {"foreign": "foreign"},
-    "HYPH": {"pos": "punc", "punctype": "dash"},
+    "FW": {"pos": "x", "foreign": "foreign"},
+    "HYPH": {"pos": "punct", "puncttype": "dash"},
     "IN": {"pos": "adp"},
     "JJ": {"pos": "adj", "degree": "pos"},
     "JJR": {"pos": "adj", "degree": "comp"},
     "JJS": {"pos": "adj", "degree": "sup"},
-    "LS": {"pos": "punc", "numtype": "ord"},
+    "LS": {"pos": "punct", "numtype": "ord"},
     "MD": {"pos": "verb", "verbtype": "mod"},
-    "NIL": {},
+    "NIL": {"pos": "no_tag"},
     "NN": {"pos": "noun", "number": "sing"},
     "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
     "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
@@ -36,7 +37,7 @@
     "RP": {"pos": "part"},
     "SYM": {"pos": "sym"},
     "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
-    "UH": {"pos": "int"},
+    "UH": {"pos": "intJ"},
     "VB": {"pos": "verb", "verbform": "inf"},
     "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
     "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
@@ -47,5 +48,13 @@
     "WP": {"pos": "noun", "prontype": "int|rel"},
     "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
     "WRB": {"pos": "adv", "prontype": "int|rel"},
-    "SP": {"pos": "space"}
+    "SP": {"pos": "space"},
+    "ADD": {"pos": "x"},
+    "NFP": {"pos": "punct"},
+    "GW": {"pos": "x"},
+    "AFX": {"pos": "x"},
+    "HYPH": {"pos": "punct"},
+    "XX": {"pos": "x"},
+    "BES": {"pos": "verb"},
+    "HVS": {"pos": "verb"}
 }
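
The punc to punct and punctype to puncttype renames matter downstream: the morphology uppercases each entry's "pos" value and looks it up in UNIV_POS_NAMES, so a value that no longer matches a universal tag raises a KeyError at tagging time. A quick validation sketch (the file path here is an assumption, since the diff view does not name the file; the tag set mirrors the enum added later in this commit):

    import json

    # Assumed location of the tag map shown above.
    with open('lang_data/en/tag_map.json') as f:
        tag_map = json.load(f)

    # Universal tags defined by the univ_pos_t enum in this commit.
    UNIV = {"NO_TAG", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ",
            "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
            "VERB", "X", "EOL", "SPACE"}

    for tag, props in tag_map.items():
        # Mirrors token.pos = UNIV_POS_NAMES[props['pos'].upper()] in morphology.pyx
        assert props["pos"].upper() in UNIV, (tag, props["pos"])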

View File

@@ -91,6 +91,8 @@ cdef class Model:
             count_feats(counts[guess], feats, n_feats, -cost)
         self._model.update(counts)
 
-    def end_training(self):
+    def end_training(self, model_loc=None):
+        if model_loc is None:
+            model_loc = self.model_loc
         self._model.end_training()
-        self._model.dump(self.model_loc, freq_thresh=0)
+        self._model.dump(model_loc, freq_thresh=0)
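
end_training now takes an optional dump path, so the same in-memory model can be averaged once and written wherever the caller chooses; Language.end_training below uses this to give each component its own subdirectory. A hypothetical call:

    # Dump to the location the model was constructed with (old behaviour) ...
    model.end_training()
    # ... or to an explicit path chosen at save time.
    model.end_training('/tmp/pos/model')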

View File

@@ -1,5 +1,10 @@
 from os import path
+try:
+    import ujson as json
+except ImportError:
+    import json
 
 from .tokenizer import Tokenizer
 from .morphology import Morphology
 from .vocab import Vocab
@@ -13,6 +18,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
+
 
 class Language(object):
     @staticmethod
@@ -113,14 +120,6 @@ class Language(object):
             attrs.IS_OOV: lambda string: True
         }
 
-    @classmethod
-    def default_dep_templates(cls):
-        return []
-
-    @classmethod
-    def default_ner_templates(cls):
-        return []
-
     @classmethod
     def default_dep_labels(cls):
         return {0: {'ROOT': True}}
@@ -186,10 +185,11 @@ class Language(object):
         return None
 
     @classmethod
-    def default_matcher(cls, vocab, data_dir=None):
-        if data_dir is None:
-            data_dir = cls.default_data_dir()
-        return Matcher.from_dir(data_dir, vocab)
+    def default_matcher(cls, vocab, data_dir):
+        if path.exists(data_dir):
+            return Matcher.from_dir(data_dir, vocab)
+        else:
+            return None
 
     def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                  parser=None, entity=None, matcher=None, serializer=None):
@@ -245,9 +245,9 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.entity.model.end_training()
-        self.tagger.model.end_training()
+        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
+        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
+        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
         self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
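
With the per-component paths passed through, a trained model directory ends up laid out roughly like this (reconstructed from the path.join calls above):

    data_dir/
        deps/model                # parser weights
        ner/model                 # entity recogniser weights
        pos/model                 # tagger weights
        vocab/strings.txt
        vocab/serializer.json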

View File

@@ -2,29 +2,41 @@ from __future__ import unicode_literals
 from os import path
 import codecs
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
+from .parts_of_speech import NOUN, VERB, ADJ
+
 
 class Lemmatizer(object):
-    def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
-        self.noun_id = noun_id
-        self.verb_id = verb_id
-        self.adj_id = adj_id
-        self.index = {}
-        self.exc = {}
+    @classmethod
+    def from_dir(cls, data_dir):
+        index = {}
+        exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
-            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
+            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+        return cls(index, exc, rules)
+
+    def __init__(self, index, exceptions, rules):
+        self.index = index
+        self.exc = exceptions
+        self.rules = rules
 
     def __call__(self, string, pos):
-        return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
-        if pos == self.noun_id:
-            return self.noun(string)
-        elif pos == self.verb_id:
-            return self.verb(string)
-        elif pos == self.adj_id:
-            return self.adj(string)
+        if pos == NOUN:
+            pos = 'noun'
+        elif pos == VERB:
+            pos = 'verb'
+        elif pos == ADJ:
+            pos = 'adj'
         else:
-            raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
+            return string
+        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
+        return min(lemmas)
 
     def noun(self, string):
         return self(string, 'noun')
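
The rewritten __call__ delegates to a module-level lemmatize() helper that sits outside this hunk. Its behaviour can be inferred from the call site: check the exception table, then try suffix rules and keep candidates attested in the index; min(lemmas) then picks one candidate deterministically. A sketch under those assumptions (the rule format, pairs of old and new suffixes, is an assumption, not shown in the diff):

    def lemmatize(string, index, exc, rules):
        string = string.lower()
        forms = []
        # Irregular forms come straight from the exception table
        forms.extend(exc.get(string, []))
        # Suffix rules, e.g. ["ies", "y"]: "studies" -> "study"
        for old, new in rules:
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if form in index:
                    forms.append(form)
        if not forms:
            forms.append(string)
        return set(forms)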

View File

@@ -1,13 +1,16 @@
 from .structs cimport TokenC
+from .strings cimport StringStore
 
 
 cdef class Morphology:
+    cdef readonly object strings
+    cdef public object lemmatizer
     cdef public object tag_map
     cdef public object tag_names
     cdef public object tag_ids
     cdef public int n_tags
 
-    cdef int assign_tag(self, TokenC* token, int tag) except -1
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1

View File

@@ -1,4 +1,5 @@
 from os import path
+from .lemmatizer import Lemmatizer
 
 try:
     import ujson as json
@@ -9,7 +10,15 @@ from spacy.parts_of_speech import UNIV_POS_NAMES
 
 
 cdef class Morphology:
+    @classmethod
+    def from_dir(cls, data_dir, lemmatizer=None):
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        if lemmatizer is None:
+            lemmatizer = Lemmatizer.from_dir(data_dir)
+        return cls(tag_map, {}, lemmatizer)
+
     def __init__(self, tag_map, fused_tokens, lemmatizer):
+        self.lemmatizer = lemmatizer
         self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -17,15 +26,13 @@ cdef class Morphology:
         for i, tag_str in enumerate(self.tag_names):
             self.tag_ids[tag_str] = i
 
-    @classmethod
-    def from_dir(cls, data_dir):
-        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        return cls(tag_map, {}, None)
-
-    cdef int assign_tag(self, TokenC* token, int tag) except -1:
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1:
+        # TODO Caching
         props = self.tag_map[self.tag_names[tag]]
         token.pos = UNIV_POS_NAMES[props['pos'].upper()]
-        token.tag = tag
+        token.tag = strings[self.tag_names[tag]]
+        lemma = self.lemmatizer(strings[token.lex.orth], token.pos)
+        token.lemma = strings[lemma]
         #token.inflection = # TODO
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1:
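
Two behavioural changes are buried in assign_tag: token.tag now holds the interned id of the tag string rather than the tagger's class index, and the lemma is set in the same pass, which is why callers must now pass the StringStore. A pure-Python sketch of the equivalent logic (illustrative only; the real code writes into a TokenC struct):

    def assign_tag(strings, token, tag_id, morphology):
        tag_str = morphology.tag_names[tag_id]            # e.g. "NN", "VBD"
        props = morphology.tag_map[tag_str]
        token.pos = UNIV_POS_NAMES[props['pos'].upper()]  # coarse universal POS
        token.tag = strings[tag_str]                      # interned string id, not the class index
        lemma = morphology.lemmatizer(strings[token.lex.orth], token.pos)
        token.lemma = strings[lemma]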

View File

@@ -2,17 +2,22 @@
 cpdef enum univ_pos_t:
     NO_TAG
     ADJ
-    ADV
     ADP
+    ADV
+    AUX
     CONJ
     DET
+    INTJ
     NOUN
     NUM
+    PART
     PRON
-    PRT
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
     VERB
     X
-    PUNCT
     EOL
     SPACE
     N_UNIV_TAGS
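
This swaps the older Petrov-style tagset (with PRT) for something much closer to the Universal Dependencies inventory (AUX, INTJ, PART, PROPN, SCONJ, SYM). Because cpdef enum assigns consecutive integers, inserting members renumbers everything after ADJ, so any previously serialized model or cache holding the old codes is invalidated. A small illustration of the shift:

    # Positions implied by the two enum definitions above.
    OLD = ["NO_TAG", "ADJ", "ADV", "ADP", "CONJ", "DET", "NOUN", "NUM",
           "PRON", "PRT", "VERB", "X", "PUNCT", "EOL", "SPACE", "N_UNIV_TAGS"]
    NEW = ["NO_TAG", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ",
           "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
           "VERB", "X", "EOL", "SPACE", "N_UNIV_TAGS"]
    for name in NEW:
        old_code = OLD.index(name) if name in OLD else None
        print(name, old_code, '->', NEW.index(name))   # e.g. VERB 10 -> 16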

View File

@@ -4,18 +4,22 @@ from __future__ import unicode_literals
 UNIV_POS_NAMES = {
     "NO_TAG": NO_TAG,
     "ADJ": ADJ,
-    "ADV": ADV,
     "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
     "CONJ": CONJ,
     "DET": DET,
+    "INTJ": INTJ,
     "NOUN": NOUN,
     "NUM": NUM,
+    "PART": PART,
     "PRON": PRON,
-    "PRT": PRT,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
     "VERB": VERB,
     "X": X,
-    "PUNCT": PUNCT,
-    "PUNC": PUNCT,
-    "SPACE": SPACE,
-    "EOL": EOL
+    "EOL": EOL,
+    "SPACE": SPACE
 }

View File

@@ -1,26 +1,12 @@
-from preshed.maps cimport PreshMapArray
-from preshed.counter cimport PreshCounter
-from cymem.cymem cimport Pool
-
 from ._ml cimport Model
-from .strings cimport StringStore
-from .structs cimport TokenC, LexemeC
-from .parts_of_speech cimport univ_pos_t
+from .structs cimport TokenC
 from .vocab cimport Vocab
 
 
 cdef class Tagger:
-    cdef readonly Pool mem
-    cdef readonly StringStore strings
-    cdef readonly Model model
     cdef readonly Vocab vocab
-    cdef public object lemmatizer
-    cdef PreshMapArray _morph_cache
+    cdef readonly Model model
     cdef public dict freqs
 
-    cdef readonly int n_tags
-
     cdef int predict(self, int i, const TokenC* tokens) except -1
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1
-
-    #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
-    #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1

View File

@@ -8,7 +8,7 @@ from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
 from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
+from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .attrs cimport *
 from ._ml cimport arg_max
@@ -102,24 +102,10 @@ cdef class Tagger:
         (P2_flags,),
     )
 
-    def make_lemmatizer(self):
-        return None
-
-    def __init__(self, Vocab vocab, templates):
-        self.mem = Pool()
-        self.vocab = vocab
-
-        cdef int n_tags = self.vocab.morphology.n_tags + 1
-
-        self.model = Model(n_tags, templates)
-        self.freqs = {TAG: defaultdict(int)}
-        for tag in self.tag_names:
-            self.freqs[TAG][self.vocab.strings[tag]] = 1
-        self.freqs[TAG][0] = 1
-
-    @property
-    def tag_names(self):
-        return tuple(sorted(self.vocab.morphology.tag_map.keys()))
+    @classmethod
+    def blank(cls, vocab, templates):
+        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        return cls(vocab, model)
 
     @classmethod
     def from_dir(cls, data_dir, vocab):
@@ -127,7 +113,22 @@ cdef class Tagger:
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        return cls(vocab, templates)
+        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        return cls(vocab, model)
+
+    def __init__(self, Vocab vocab, model):
+        self.vocab = vocab
+        self.model = model
+        # TODO: Move this to tag map
+        self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
+
+    @property
+    def tag_names(self):
+        return self.vocab.morphology.tag_names
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -142,29 +143,28 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def train(self, Doc tokens, object gold_tag_strs):
+        assert len(tokens) == len(gold_tag_strs)
         cdef int i
         cdef int loss
        cdef const weight_t* scores
-        golds = [self.tag_names.index(g) if g is not None else -1
-                 for g in gold_tag_strs]
+        golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         correct = 0
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-
-            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
-
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
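
Taken together with the commit message, the pieces above support a minimal training loop along these lines (a sketch: vocab loading and the gold corpus are elided, the output path is an assumption, and only methods shown in this diff are used):

    from spacy.tagger import Tagger

    # `vocab` is an existing Vocab whose morphology was built from the
    # tag map above; how it is loaded is elided here.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    for itn in range(5):                    # a few averaged-perceptron epochs
        for doc, gold in train_data:        # gold: one PTB tag string per token
            n_correct = tagger.train(doc, gold)

    # Average the weights and dump them; model load/save is what the
    # commit message says is still untested.
    tagger.model.end_training('data/pos/model')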