commit 0af139e183
parent 320ced276a

* Tagger training now working. Still need to test load/save of model. Morphology still broken.
@@ -1,11 +1,12 @@
 {
-    ".": {"pos": "punc", "punctype": "peri"},
-    ",": {"pos": "punc", "punctype": "comm"},
-    "-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"},
-    "-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"},
-    "``": {"pos": "punc", "punctype": "quot", "puncside": "ini"},
-    "\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"},
-    ":": {"pos": "punc"},
+    ".": {"pos": "punct", "puncttype": "peri"},
+    ",": {"pos": "punct", "puncttype": "comm"},
+    "-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
+    "-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
+    "``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
+    "\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    "''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    ":": {"pos": "punct"},
     "$": {"pos": "sym", "other": {"symtype": "currency"}},
     "#": {"pos": "sym", "other": {"symtype": "numbersign"}},
     "AFX": {"pos": "adj", "hyph": "hyph"},
@@ -13,15 +14,15 @@
     "CD": {"pos": "num", "numtype": "card"},
     "DT": {"pos": "adj", "prontype": "prn"},
     "EX": {"pos": "adv", "advtype": "ex"},
-    "FW": {"foreign": "foreign"},
-    "HYPH": {"pos": "punc", "punctype": "dash"},
+    "FW": {"pos": "x", "foreign": "foreign"},
+    "HYPH": {"pos": "punct", "puncttype": "dash"},
     "IN": {"pos": "adp"},
     "JJ": {"pos": "adj", "degree": "pos"},
     "JJR": {"pos": "adj", "degree": "comp"},
     "JJS": {"pos": "adj", "degree": "sup"},
-    "LS": {"pos": "punc", "numtype": "ord"},
+    "LS": {"pos": "punct", "numtype": "ord"},
     "MD": {"pos": "verb", "verbtype": "mod"},
-    "NIL": {},
+    "NIL": {"pos": "no_tag"},
     "NN": {"pos": "noun", "number": "sing"},
     "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
     "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
@@ -36,7 +37,7 @@
     "RP": {"pos": "part"},
     "SYM": {"pos": "sym"},
     "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
-    "UH": {"pos": "int"},
+    "UH": {"pos": "intJ"},
     "VB": {"pos": "verb", "verbform": "inf"},
     "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
     "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
@@ -47,5 +48,13 @@
     "WP": {"pos": "noun", "prontype": "int|rel"},
     "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
     "WRB": {"pos": "adv", "prontype": "int|rel"},
-    "SP": {"pos": "space"}
+    "SP": {"pos": "space"},
+    "ADD": {"pos": "x"},
+    "NFP": {"pos": "punct"},
+    "GW": {"pos": "x"},
+    "AFX": {"pos": "x"},
+    "HYPH": {"pos": "punct"},
+    "XX": {"pos": "x"},
+    "BES": {"pos": "verb"},
+    "HVS": {"pos": "verb"},
 }
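Reviewer note: the four hunks above edit the English tag map (the tag_map.json that Morphology.from_dir loads, further down in this commit). The abbreviated punc/punctype/puncside keys become punct/puncttype/punctside, FW and NIL gain explicit coarse tags, and entries are added for web-text tags (ADD, NFP, GW, XX, BES, HVS). A minimal sanity-check sketch for the edited file; the path and the set of accepted coarse tags are assumptions inferred from this diff, not part of the commit:

    import json

    # Coarse tags assumed from this diff; note "intJ" upper-cases to INTJ in the new enum.
    COARSE = {"adj", "adp", "adv", "conj", "det", "intJ", "no_tag", "noun", "num",
              "part", "pron", "punct", "space", "sym", "verb", "x"}

    tag_map = json.load(open('tag_map.json'))  # illustrative path
    for tag, props in sorted(tag_map.items()):
        assert props.get('pos') in COARSE, (tag, props)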
@@ -91,6 +91,8 @@ cdef class Model:
             count_feats(counts[guess], feats, n_feats, -cost)
         self._model.update(counts)

-    def end_training(self):
+    def end_training(self, model_loc=None):
+        if model_loc is None:
+            model_loc = self.model_loc
         self._model.end_training()
-        self._model.dump(self.model_loc, freq_thresh=0)
+        self._model.dump(model_loc, freq_thresh=0)
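Reviewer note: Model.end_training now accepts an optional dump location instead of always writing to the location fixed at construction, which is what lets Language.end_training (below) route each component's weights into its own subdirectory. Usage sketch, with `model` and the paths purely illustrative:

    from os import path

    model.end_training(path.join(data_dir, 'pos', 'model'))  # explicit target
    model.end_training()  # unchanged behaviour: falls back to self.model_loc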
@@ -1,5 +1,10 @@
 from os import path

+try:
+    import ujson as json
+except ImportError:
+    import json
+
 from .tokenizer import Tokenizer
 from .morphology import Morphology
 from .vocab import Vocab
@@ -13,6 +18,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager

+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
+

 class Language(object):
     @staticmethod
@@ -113,14 +120,6 @@ class Language(object):
             attrs.IS_OOV: lambda string: True
         }

-    @classmethod
-    def default_dep_templates(cls):
-        return []
-
-    @classmethod
-    def default_ner_templates(cls):
-        return []
-
     @classmethod
     def default_dep_labels(cls):
         return {0: {'ROOT': True}}
@@ -186,10 +185,11 @@ class Language(object):
         return None

     @classmethod
-    def default_matcher(cls, vocab, data_dir=None):
-        if data_dir is None:
-            data_dir = cls.default_data_dir()
-        return Matcher.from_dir(data_dir, vocab)
+    def default_matcher(cls, vocab, data_dir):
+        if path.exists(data_dir):
+            return Matcher.from_dir(data_dir, vocab)
+        else:
+            return None

     def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                  parser=None, entity=None, matcher=None, serializer=None):
@@ -245,9 +245,9 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.entity.model.end_training()
-        self.tagger.model.end_training()
+        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
+        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
+        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
         self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))

         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
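Reviewer note: the Language hunks above pull in a json module and the attribute IDs, drop the now-unused default dep/NER template hooks, make default_matcher return None when the data directory is missing rather than silently substituting the default one, and thread per-component dump paths through to Model.end_training. The end of a training run would then look roughly like this (a sketch; `nlp` and the directory are illustrative):

    nlp.end_training(data_dir='/tmp/en_model')
    # Expected on-disk layout after the call:
    #   deps/model, ner/model, pos/model         (per-component weights)
    #   vocab/strings.txt, vocab/serializer.json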
@@ -2,29 +2,41 @@ from __future__ import unicode_literals
 from os import path
 import codecs

+try:
+    import ujson as json
+except ImportError:
+    import json
+
+from .parts_of_speech import NOUN, VERB, ADJ
+
 
 class Lemmatizer(object):
-    def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
-        self.noun_id = noun_id
-        self.verb_id = verb_id
-        self.adj_id = adj_id
-        self.index = {}
-        self.exc = {}
+    @classmethod
+    def from_dir(cls, data_dir):
+        index = {}
+        exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
-            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
+            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+        return cls(index, exc, rules)
+
+    def __init__(self, index, exceptions, rules):
+        self.index = index
+        self.exc = exceptions
+        self.rules = rules

     def __call__(self, string, pos):
-        return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
-        if pos == self.noun_id:
-            return self.noun(string)
-        elif pos == self.verb_id:
-            return self.verb(string)
-        elif pos == self.adj_id:
-            return self.adj(string)
+        if pos == NOUN:
+            pos = 'noun'
+        elif pos == VERB:
+            pos = 'verb'
+        elif pos == ADJ:
+            pos = 'adj'
         else:
-            raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
+            return string
+        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
+        return min(lemmas)

     def noun(self, string):
         return self(string, 'noun')
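Reviewer note: the Lemmatizer drops its WordNet-ID constructor for a from_dir classmethod that also loads lemma_rules.json, and __call__ now translates the universal POS constants to string keys itself, returning the string unchanged for any other part of speech; min(lemmas) just picks a deterministic candidate. Usage sketch (the data directory is an assumption, not taken from the commit):

    from spacy.lemmatizer import Lemmatizer
    from spacy.parts_of_speech import NOUN

    # Assumes data_dir contains index.noun, noun.exc, ..., and lemma_rules.json.
    lemmatizer = Lemmatizer.from_dir('lang_data/wordnet')  # illustrative path
    print(lemmatizer('ponies', NOUN))  # expected: 'pony'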
@@ -1,13 +1,16 @@
 from .structs cimport TokenC
+from .strings cimport StringStore


 cdef class Morphology:
+    cdef readonly object strings
+    cdef public object lemmatizer
     cdef public object tag_map
     cdef public object tag_names
     cdef public object tag_ids
     cdef public int n_tags

-    cdef int assign_tag(self, TokenC* token, int tag) except -1
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1

     cdef int assign_from_dict(self, TokenC* token, props) except -1
@@ -1,4 +1,5 @@
 from os import path
+from .lemmatizer import Lemmatizer

 try:
     import ujson as json
@@ -9,7 +10,15 @@ from spacy.parts_of_speech import UNIV_POS_NAMES


 cdef class Morphology:
+    @classmethod
+    def from_dir(cls, data_dir, lemmatizer=None):
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        if lemmatizer is None:
+            lemmatizer = Lemmatizer.from_dir(data_dir)
+        return cls(tag_map, {}, lemmatizer)
+
     def __init__(self, tag_map, fused_tokens, lemmatizer):
+        self.lemmatizer = lemmatizer
         self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -17,15 +26,13 @@ cdef class Morphology:
         for i, tag_str in enumerate(self.tag_names):
             self.tag_ids[tag_str] = i

-    @classmethod
-    def from_dir(cls, data_dir):
-        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        return cls(tag_map, {}, None)
-
-    cdef int assign_tag(self, TokenC* token, int tag) except -1:
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1:
+        # TODO Caching
         props = self.tag_map[self.tag_names[tag]]
         token.pos = UNIV_POS_NAMES[props['pos'].upper()]
-        token.tag = tag
+        token.tag = strings[self.tag_names[tag]]
+        lemma = self.lemmatizer(strings[token.lex.orth], token.pos)
+        token.lemma = strings[lemma]
         #token.inflection = # TODO

     cdef int assign_from_dict(self, TokenC* token, props) except -1:
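Reviewer note: Morphology now owns the lemmatizer. from_dir builds one from the same directory when none is supplied, and assign_tag takes the StringStore so it can intern the tag string (token.tag was previously just the tag index) and write the lemma onto the token in the same pass. Construction sketch (the directory is illustrative):

    from spacy.morphology import Morphology

    # Assumes data_dir holds tag_map.json plus the lemmatizer's index/exc/rule files.
    morphology = Morphology.from_dir('lang_data/en')
    print(morphology.n_tags, morphology.tag_names[:5])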
@@ -2,17 +2,22 @@
 cpdef enum univ_pos_t:
     NO_TAG
     ADJ
-    ADV
     ADP
+    ADV
+    AUX
     CONJ
     DET
+    INTJ
     NOUN
     NUM
+    PART
     PRON
-    PRT
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
     VERB
     X
-    PUNCT
     EOL
     SPACE
     N_UNIV_TAGS
@@ -4,18 +4,22 @@ from __future__ import unicode_literals
 UNIV_POS_NAMES = {
     "NO_TAG": NO_TAG,
     "ADJ": ADJ,
-    "ADV": ADV,
     "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
     "CONJ": CONJ,
     "DET": DET,
+    "INTJ": INTJ,
     "NOUN": NOUN,
     "NUM": NUM,
+    "PART": PART,
     "PRON": PRON,
-    "PRT": PRT,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
     "VERB": VERB,
     "X": X,
-    "PUNCT": PUNCT,
-    "PUNC": PUNCT,
-    "SPACE": SPACE,
-    "EOL": EOL
+    "EOL": EOL,
+    "SPACE": SPACE
 }
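Reviewer note: the coarse tag inventory moves from the old Google universal set to the Universal Dependencies one: PRT gives way to PART, while AUX, INTJ, PROPN, SCONJ, and SYM join the enum (PUNCT is repositioned), with UNIV_POS_NAMES kept in the same order and the legacy "PUNC" alias dropped. A quick consistency check one could run against the rebuilt module (cpdef enum members should be visible from Python):

    from spacy import parts_of_speech as pos

    assert pos.UNIV_POS_NAMES['PROPN'] == pos.PROPN
    assert pos.UNIV_POS_NAMES['PART'] == pos.PART
    assert 'PRT' not in pos.UNIV_POS_NAMES  # old Google-style tag is gone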
@@ -1,26 +1,12 @@
-from preshed.maps cimport PreshMapArray
-from preshed.counter cimport PreshCounter
-from cymem.cymem cimport Pool
-
 from ._ml cimport Model
-from .strings cimport StringStore
-from .structs cimport TokenC, LexemeC
-from .parts_of_speech cimport univ_pos_t
+from .structs cimport TokenC
 from .vocab cimport Vocab


 cdef class Tagger:
-    cdef readonly Pool mem
-    cdef readonly StringStore strings
-    cdef readonly Model model
     cdef readonly Vocab vocab
-    cdef public object lemmatizer
-    cdef PreshMapArray _morph_cache
+    cdef readonly Model model
     cdef public dict freqs
-
-    cdef readonly int n_tags

     cdef int predict(self, int i, const TokenC* tokens) except -1
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1
-    #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
-    #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
@@ -8,7 +8,7 @@ from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
 from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
+from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE

 from .attrs cimport *
 from ._ml cimport arg_max
@@ -102,24 +102,10 @@ cdef class Tagger:
             (P2_flags,),
         )

-    def make_lemmatizer(self):
-        return None
-
-    def __init__(self, Vocab vocab, templates):
-        self.mem = Pool()
-        self.vocab = vocab
-
-        cdef int n_tags = self.vocab.morphology.n_tags + 1
-
-        self.model = Model(n_tags, templates)
-        self.freqs = {TAG: defaultdict(int)}
-        for tag in self.tag_names:
-            self.freqs[TAG][self.vocab.strings[tag]] = 1
-        self.freqs[TAG][0] = 1
-
-    @property
-    def tag_names(self):
-        return tuple(sorted(self.vocab.morphology.tag_map.keys()))
+    @classmethod
+    def blank(cls, vocab, templates):
+        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        return cls(vocab, model)

     @classmethod
     def from_dir(cls, data_dir, vocab):
@@ -127,7 +113,22 @@ cdef class Tagger:
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        return cls(vocab, templates)
+        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        return cls(vocab, model)
+
+    def __init__(self, Vocab vocab, model):
+        self.vocab = vocab
+        self.model = model
+
+        # TODO: Move this to tag map
+        self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
+
+    @property
+    def tag_names(self):
+        return self.vocab.morphology.tag_names

     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -142,29 +143,28 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

     def train(self, Doc tokens, object gold_tag_strs):
+        assert len(tokens) == len(gold_tag_strs)
         cdef int i
         cdef int loss
         cdef const weight_t* scores
-        golds = [self.tag_names.index(g) if g is not None else -1
-                 for g in gold_tag_strs]
+        golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         correct = 0
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
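Reviewer note: Tagger.__init__ now takes a ready-made Model, with Tagger.blank constructing an empty one and from_dir loading weights from disk, and every assign_tag call threads the StringStore through so tags are interned and lemmas assigned in one place. The training loop the commit message refers to would then look roughly like this (a sketch against the APIs visible in this diff; `vocab`, `data_dir`, and the corpus iterable are illustrative, not part of the commit):

    from os import path

    tagger = Tagger.blank(vocab, Tagger.default_templates())
    for doc, gold_tags in training_corpus:        # (Doc, list-of-tag-strings) pairs
        n_correct = tagger.train(doc, gold_tags)  # perceptron update; returns #correct
    tagger.model.end_training(path.join(data_dir, 'pos', 'model'))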