* Tagger training now working. Still need to test load/save of model. Morphology still broken.

Matthew Honnibal 2015-08-27 09:16:11 +02:00
parent 320ced276a
commit 0af139e183
10 changed files with 134 additions and 106 deletions

View File

@@ -1,11 +1,12 @@
 {
-    ".": {"pos": "punc", "punctype": "peri"},
-    ",": {"pos": "punc", "punctype": "comm"},
-    "-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"},
-    "-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"},
-    "``": {"pos": "punc", "punctype": "quot", "puncside": "ini"},
-    "\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"},
-    ":": {"pos": "punc"},
+    ".": {"pos": "punct", "puncttype": "peri"},
+    ",": {"pos": "punct", "puncttype": "comm"},
+    "-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
+    "-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
+    "``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
+    "\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    "''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    ":": {"pos": "punct"},
     "$": {"pos": "sym", "other": {"symtype": "currency"}},
     "#": {"pos": "sym", "other": {"symtype": "numbersign"}},
     "AFX": {"pos": "adj", "hyph": "hyph"},
@@ -13,15 +14,15 @@
     "CD": {"pos": "num", "numtype": "card"},
     "DT": {"pos": "adj", "prontype": "prn"},
     "EX": {"pos": "adv", "advtype": "ex"},
-    "FW": {"foreign": "foreign"},
-    "HYPH": {"pos": "punc", "punctype": "dash"},
+    "FW": {"pos": "x", "foreign": "foreign"},
+    "HYPH": {"pos": "punct", "puncttype": "dash"},
     "IN": {"pos": "adp"},
     "JJ": {"pos": "adj", "degree": "pos"},
     "JJR": {"pos": "adj", "degree": "comp"},
     "JJS": {"pos": "adj", "degree": "sup"},
-    "LS": {"pos": "punc", "numtype": "ord"},
+    "LS": {"pos": "punct", "numtype": "ord"},
     "MD": {"pos": "verb", "verbtype": "mod"},
-    "NIL": {},
+    "NIL": {"pos": "no_tag"},
     "NN": {"pos": "noun", "number": "sing"},
     "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
     "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
@@ -36,7 +37,7 @@
     "RP": {"pos": "part"},
     "SYM": {"pos": "sym"},
     "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
-    "UH": {"pos": "int"},
+    "UH": {"pos": "intJ"},
     "VB": {"pos": "verb", "verbform": "inf"},
     "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
     "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
@@ -47,5 +48,13 @@
     "WP": {"pos": "noun", "prontype": "int|rel"},
     "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
     "WRB": {"pos": "adv", "prontype": "int|rel"},
-    "SP": {"pos": "space"}
+    "SP": {"pos": "space"},
+    "ADD": {"pos": "x"},
+    "NFP": {"pos": "punct"},
+    "GW": {"pos": "x"},
+    "AFX": {"pos": "x"},
+    "HYPH": {"pos": "punct"},
+    "XX": {"pos": "x"},
+    "BES": {"pos": "verb"},
+    "HVS": {"pos": "verb"}
 }
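
A note on the new entries: every tag now names a coarse "pos", including FW and NIL, because Morphology.assign_tag (below) resolves props['pos'].upper() unconditionally. A minimal sketch of the consistency check this implies — the helper and data here are illustrative, not part of the commit:

    def check_tag_map(tag_map, known_pos):
        # Every entry must name a coarse POS the univ_pos_t enum knows about.
        for tag, props in tag_map.items():
            assert 'pos' in props, tag
            assert props['pos'].upper() in known_pos, (tag, props['pos'])

    # "intJ".upper() == "INTJ", so the odd casing on UH still resolves.
    check_tag_map({"NIL": {"pos": "no_tag"}, "UH": {"pos": "intJ"}},
                  {"NO_TAG", "INTJ"})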

View File

@@ -91,6 +91,8 @@ cdef class Model:
             count_feats(counts[guess], feats, n_feats, -cost)
         self._model.update(counts)
 
-    def end_training(self):
+    def end_training(self, model_loc=None):
+        if model_loc is None:
+            model_loc = self.model_loc
         self._model.end_training()
-        self._model.dump(self.model_loc, freq_thresh=0)
+        self._model.dump(model_loc, freq_thresh=0)
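
The new model_loc argument lets callers dump the weights somewhere other than the path the model was constructed with; calling with no argument keeps the old behaviour. A sketch of the fallback logic, pulled out as a plain function:

    def resolve_model_loc(model_loc, default_loc):
        # Mirrors the new end_training logic: an explicit path wins,
        # otherwise fall back to the construction-time location.
        return default_loc if model_loc is None else model_loc

    assert resolve_model_loc(None, 'pos/model') == 'pos/model'
    assert resolve_model_loc('/tmp/model', 'pos/model') == '/tmp/model'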

View File

@@ -1,5 +1,10 @@
 from os import path
+
+try:
+    import ujson as json
+except ImportError:
+    import json
 
 from .tokenizer import Tokenizer
 from .morphology import Morphology
 from .vocab import Vocab
@@ -13,6 +18,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
+
 
 
 class Language(object):
     @staticmethod
@@ -113,14 +120,6 @@ class Language(object):
             attrs.IS_OOV: lambda string: True
         }
 
-    @classmethod
-    def default_dep_templates(cls):
-        return []
-
-    @classmethod
-    def default_ner_templates(cls):
-        return []
-
     @classmethod
     def default_dep_labels(cls):
         return {0: {'ROOT': True}}
@@ -186,10 +185,11 @@ class Language(object):
         return None
 
     @classmethod
-    def default_matcher(cls, vocab, data_dir=None):
-        if data_dir is None:
-            data_dir = cls.default_data_dir()
-        return Matcher.from_dir(data_dir, vocab)
+    def default_matcher(cls, vocab, data_dir):
+        if path.exists(data_dir):
+            return Matcher.from_dir(data_dir, vocab)
+        else:
+            return None
 
     def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                  parser=None, entity=None, matcher=None, serializer=None):
@@ -245,9 +245,9 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.entity.model.end_training()
-        self.tagger.model.end_training()
+        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
+        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
+        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
         self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
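
end_training now pins each component's weights to a fixed layout under data_dir rather than each model's load location. A small sketch of the layout those calls imply (paths taken from the diff; the helper itself is illustrative):

    from os import path

    def model_paths(data_dir):
        # Directory layout written by Language.end_training.
        return {
            'parser': path.join(data_dir, 'deps', 'model'),
            'entity': path.join(data_dir, 'ner', 'model'),
            'tagger': path.join(data_dir, 'pos', 'model'),
            'strings': path.join(data_dir, 'vocab', 'strings.txt'),
            'serializer': path.join(data_dir, 'vocab', 'serializer.json'),
        }

    assert model_paths('en')['tagger'] == path.join('en', 'pos', 'model')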

View File

@@ -2,29 +2,41 @@ from __future__ import unicode_literals
 from os import path
 import codecs
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
+from .parts_of_speech import NOUN, VERB, ADJ
+
 
 class Lemmatizer(object):
-    def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
-        self.noun_id = noun_id
-        self.verb_id = verb_id
-        self.adj_id = adj_id
-        self.index = {}
-        self.exc = {}
+    @classmethod
+    def from_dir(cls, data_dir):
+        index = {}
+        exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
-            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
+            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+        return cls(index, exc, rules)
+
+    def __init__(self, index, exceptions, rules):
+        self.index = index
+        self.exc = exceptions
+        self.rules = rules
 
     def __call__(self, string, pos):
-        return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
-        if pos == self.noun_id:
-            return self.noun(string)
-        elif pos == self.verb_id:
-            return self.verb(string)
-        elif pos == self.adj_id:
-            return self.adj(string)
+        if pos == NOUN:
+            pos = 'noun'
+        elif pos == VERB:
+            pos = 'verb'
+        elif pos == ADJ:
+            pos = 'adj'
         else:
             raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
-        return string
+        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
+        return min(lemmas)
 
     def noun(self, string):
         return self(string, 'noun')
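
The rewritten __call__ maps the Universal POS constant to a string key and then takes min() over the candidate set, so the lemma choice is deterministic. A self-contained sketch of that selection step, with toy data standing in for the WordNet index, exceptions, and rules files:

    # Toy stand-ins for index.noun, noun.exc and lemma_rules.json.
    index = {'aardvark', 'woman'}
    exc = {'women': ('woman',)}
    rules = [['s', '']]  # strip a trailing "s"

    def lemmatize(string, index, exc, rules):
        # Simplified version of the module-level helper the class calls into.
        if string in exc:
            return set(exc[string])
        forms = set()
        if string in index:
            forms.add(string)
        for old, new in rules:
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if form in index:
                    forms.add(form)
        return forms or {string}

    assert min(lemmatize('women', index, exc, rules)) == 'woman'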

View File

@@ -1,13 +1,16 @@
 from .structs cimport TokenC
+from .strings cimport StringStore
 
 
 cdef class Morphology:
+    cdef readonly object strings
     cdef public object lemmatizer
     cdef public object tag_map
     cdef public object tag_names
     cdef public object tag_ids
     cdef public int n_tags
 
-    cdef int assign_tag(self, TokenC* token, int tag) except -1
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1

View File

@@ -1,4 +1,5 @@
+from os import path
 from .lemmatizer import Lemmatizer
 try:
     import ujson as json
 except ImportError:
@@ -9,7 +10,15 @@ from spacy.parts_of_speech import UNIV_POS_NAMES
 
 cdef class Morphology:
+    @classmethod
+    def from_dir(cls, data_dir, lemmatizer=None):
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        if lemmatizer is None:
+            lemmatizer = Lemmatizer.from_dir(data_dir)
+        return cls(tag_map, {}, lemmatizer)
+
     def __init__(self, tag_map, fused_tokens, lemmatizer):
         self.lemmatizer = lemmatizer
         self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -17,15 +26,13 @@ cdef class Morphology:
         for i, tag_str in enumerate(self.tag_names):
             self.tag_ids[tag_str] = i
 
-    @classmethod
-    def from_dir(cls, data_dir):
-        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        return cls(tag_map, {}, None)
-
-    cdef int assign_tag(self, TokenC* token, int tag) except -1:
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1:
         # TODO Caching
         props = self.tag_map[self.tag_names[tag]]
         token.pos = UNIV_POS_NAMES[props['pos'].upper()]
-        token.tag = tag
+        token.tag = strings[self.tag_names[tag]]
+        lemma = self.lemmatizer(strings[token.lex.orth], token.pos)
+        token.lemma = strings[lemma]
         #token.inflection = # TODO
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1:
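
assign_tag now writes three fields instead of one: the coarse POS from the tag map, the fine-grained tag as a string-store ID rather than a raw index, and a lemma from the lemmatizer. A pure-Python sketch of that flow, with a dict standing in for the StringStore (all names here are illustrative):

    strings = {}
    def store(s):
        # Dict-backed stand-in for StringStore: one stable ID per string.
        return strings.setdefault(s, len(strings) + 1)

    tag_map = {'NN': {'pos': 'noun', 'number': 'sing'}}
    tag_names = ('NN',)

    def assign_tag(token, tag_id, lemmatizer):
        props = tag_map[tag_names[tag_id]]
        token['pos'] = props['pos'].upper()      # token.pos = UNIV_POS_NAMES[...]
        token['tag'] = store(tag_names[tag_id])  # a string ID, not the raw index
        token['lemma'] = store(lemmatizer(token['orth'], token['pos']))

    token = {'orth': 'dogs'}
    assign_tag(token, 0, lambda string, pos: string.rstrip('s'))
    assert token['lemma'] == store('dog')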

View File

@@ -2,17 +2,22 @@
 cpdef enum univ_pos_t:
     NO_TAG
     ADJ
-    ADV
     ADP
+    ADV
+    AUX
     CONJ
     DET
+    INTJ
     NOUN
     NUM
+    PART
     PRON
-    PRT
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
     VERB
     X
-    PUNCT
     EOL
     SPACE
     N_UNIV_TAGS

View File

@@ -4,18 +4,22 @@ from __future__ import unicode_literals
 UNIV_POS_NAMES = {
     "NO_TAG": NO_TAG,
     "ADJ": ADJ,
-    "ADV": ADV,
     "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
     "CONJ": CONJ,
     "DET": DET,
+    "INTJ": INTJ,
     "NOUN": NOUN,
     "NUM": NUM,
+    "PART": PART,
     "PRON": PRON,
-    "PRT": PRT,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
     "VERB": VERB,
     "X": X,
-    "PUNCT": PUNCT,
-    "PUNC": PUNCT,
-    "SPACE": SPACE,
-    "EOL": EOL
+    "EOL": EOL,
+    "SPACE": SPACE
 }
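
The dict now mirrors the univ_pos_t enum one-to-one (the old "PRT" entry and, it appears, the "PUNC" alias go away), so a tag map's "pos" values can be validated by upper-casing and looking up. A tiny sketch of that normalisation; the numeric values are illustrative, since the real ones come from the Cython enum:

    names = ["NO_TAG", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ",
             "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
             "VERB", "X", "EOL", "SPACE"]
    UNIV_POS_NAMES = {name: i for i, name in enumerate(names)}

    # Same normalisation Morphology.assign_tag applies to tag map entries.
    assert UNIV_POS_NAMES["punct".upper()] == UNIV_POS_NAMES["PUNCT"]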

View File

@@ -1,26 +1,12 @@
-from preshed.maps cimport PreshMapArray
-from preshed.counter cimport PreshCounter
-from cymem.cymem cimport Pool
-
 from ._ml cimport Model
-from .strings cimport StringStore
-from .structs cimport TokenC, LexemeC
-from .parts_of_speech cimport univ_pos_t
+from .structs cimport TokenC
+from .vocab cimport Vocab
 
 
 cdef class Tagger:
-    cdef readonly Pool mem
-    cdef readonly StringStore strings
-    cdef readonly Model model
+    cdef readonly Vocab vocab
-    cdef public object lemmatizer
-    cdef PreshMapArray _morph_cache
+    cdef readonly Model model
     cdef public dict freqs
 
-    cdef readonly int n_tags
-
     cdef int predict(self, int i, const TokenC* tokens) except -1
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1
-
-    #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
-    #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1

View File

@@ -8,7 +8,7 @@ from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
 from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
+from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 from .attrs cimport *
 from ._ml cimport arg_max
@@ -102,24 +102,10 @@ cdef class Tagger:
             (P2_flags,),
         )
 
-    def make_lemmatizer(self):
-        return None
-
-    def __init__(self, Vocab vocab, templates):
-        self.mem = Pool()
-        self.vocab = vocab
-        cdef int n_tags = self.vocab.morphology.n_tags + 1
-        self.model = Model(n_tags, templates)
-        self.freqs = {TAG: defaultdict(int)}
-        for tag in self.tag_names:
-            self.freqs[TAG][self.vocab.strings[tag]] = 1
-        self.freqs[TAG][0] = 1
-
-    @property
-    def tag_names(self):
-        return tuple(sorted(self.vocab.morphology.tag_map.keys()))
+    @classmethod
+    def blank(cls, vocab, templates):
+        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        return cls(vocab, model)
 
     @classmethod
     def from_dir(cls, data_dir, vocab):
@@ -127,7 +113,22 @@ cdef class Tagger:
         if path.exists(path.join(data_dir, 'templates.json')):
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        return cls(vocab, templates)
+        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        return cls(vocab, model)
+
+    def __init__(self, Vocab vocab, model):
+        self.vocab = vocab
+        self.model = model
+
+        # TODO: Move this to tag map
+        self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
+
+    @property
+    def tag_names(self):
+        return self.vocab.morphology.tag_names
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -142,29 +143,28 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def train(self, Doc tokens, object gold_tag_strs):
         assert len(tokens) == len(gold_tag_strs)
         cdef int i
         cdef int loss
         cdef const weight_t* scores
-        golds = [self.tag_names.index(g) if g is not None else -1
-                 for g in gold_tag_strs]
+        golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         correct = 0
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
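
train() looks up each gold tag string in tag_names (or -1 when the gold tag is missing), updates the model, re-assigns the guessed tag, and counts a token as correct whenever there is no loss. A sketch of that pure-Python bookkeeping, separated from the Cython update step:

    def count_correct(tag_names, guesses, gold_tag_strs):
        # -1 marks a missing gold tag; such tokens never count as errors.
        golds = [tag_names.index(g) if g is not None else -1
                 for g in gold_tag_strs]
        correct = 0
        for guess, gold in zip(guesses, golds):
            loss = gold != -1 and guess != gold
            correct += loss == 0
        return correct

    assert count_correct(('NN', 'VB'), [0, 1, 0], ['NN', 'VB', None]) == 3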