mirror of https://github.com/explosion/spaCy.git

commit 0af139e183 (parent 320ced276a)

* Tagger training now working. Still need to test load/save of model. Morphology still broken.
tag_map.json (English tag map):

@@ -1,11 +1,12 @@
 {
-    ".": {"pos": "punc", "punctype": "peri"},
-    ",": {"pos": "punc", "punctype": "comm"},
-    "-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"},
-    "-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"},
-    "``": {"pos": "punc", "punctype": "quot", "puncside": "ini"},
-    "\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"},
-    ":": {"pos": "punc"},
+    ".": {"pos": "punct", "puncttype": "peri"},
+    ",": {"pos": "punct", "puncttype": "comm"},
+    "-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"},
+    "-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"},
+    "``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"},
+    "\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    "''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"},
+    ":": {"pos": "punct"},
     "$": {"pos": "sym", "other": {"symtype": "currency"}},
     "#": {"pos": "sym", "other": {"symtype": "numbersign"}},
     "AFX": {"pos": "adj", "hyph": "hyph"},
@@ -13,15 +14,15 @@
     "CD": {"pos": "num", "numtype": "card"},
     "DT": {"pos": "adj", "prontype": "prn"},
     "EX": {"pos": "adv", "advtype": "ex"},
-    "FW": {"foreign": "foreign"},
-    "HYPH": {"pos": "punc", "punctype": "dash"},
+    "FW": {"pos": "x", "foreign": "foreign"},
+    "HYPH": {"pos": "punct", "puncttype": "dash"},
     "IN": {"pos": "adp"},
     "JJ": {"pos": "adj", "degree": "pos"},
     "JJR": {"pos": "adj", "degree": "comp"},
     "JJS": {"pos": "adj", "degree": "sup"},
-    "LS": {"pos": "punc", "numtype": "ord"},
+    "LS": {"pos": "punct", "numtype": "ord"},
     "MD": {"pos": "verb", "verbtype": "mod"},
-    "NIL": {},
+    "NIL": {"pos": "no_tag"},
     "NN": {"pos": "noun", "number": "sing"},
     "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"},
     "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"},
@@ -36,7 +37,7 @@
     "RP": {"pos": "part"},
     "SYM": {"pos": "sym"},
     "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"},
-    "UH": {"pos": "int"},
+    "UH": {"pos": "intJ"},
     "VB": {"pos": "verb", "verbform": "inf"},
     "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"},
     "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"},
@@ -47,5 +48,13 @@
     "WP": {"pos": "noun", "prontype": "int|rel"},
     "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"},
     "WRB": {"pos": "adv", "prontype": "int|rel"},
-    "SP": {"pos": "space"}
+    "SP": {"pos": "space"},
+    "ADD": {"pos": "x"},
+    "NFP": {"pos": "punct"},
+    "GW": {"pos": "x"},
+    "AFX": {"pos": "x"},
+    "HYPH": {"pos": "punct"},
+    "XX": {"pos": "x"},
+    "BES": {"pos": "verb"},
+    "HVS": {"pos": "verb"},
 }
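Note: each key above is a Penn Treebank fine-grained tag, and each value carries the coarse universal "pos" plus morphological features; Morphology.from_dir (further down in this commit) loads this file with json.load. A minimal stand-alone sketch of a lookup (the file path is illustrative):

    import json

    with open('tag_map.json') as file_:
        tag_map = json.load(file_)

    props = tag_map['VBD']          # {"pos": "verb", "verbform": "fin", "tense": "past"}
    assert props['pos'] == 'verb'
    assert props['tense'] == 'past'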
spacy/_ml.pyx:

@@ -91,6 +91,8 @@ cdef class Model:
                 count_feats(counts[guess], feats, n_feats, -cost)
         self._model.update(counts)
 
-    def end_training(self):
+    def end_training(self, model_loc=None):
+        if model_loc is None:
+            model_loc = self.model_loc
         self._model.end_training()
-        self._model.dump(self.model_loc, freq_thresh=0)
+        self._model.dump(model_loc, freq_thresh=0)
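Note: the new model_loc parameter keeps the old zero-argument behaviour (falling back to the path the model was constructed with) while letting callers pick the dump target per call; the language.py hunk below relies on exactly that. A hedged usage sketch (model and data_dir are illustrative names):

    from os import path

    model.end_training()                                     # dumps to self.model_loc, as before
    model.end_training(path.join(data_dir, 'pos', 'model'))  # dumps to an explicit location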
spacy/language.py:

@@ -1,5 +1,10 @@
 from os import path
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
 from .tokenizer import Tokenizer
 from .morphology import Morphology
 from .vocab import Vocab
@@ -13,6 +18,8 @@ from . import orth
 from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
+from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
+
 
 class Language(object):
     @staticmethod
@@ -113,14 +120,6 @@ class Language(object):
             attrs.IS_OOV: lambda string: True
         }
 
-    @classmethod
-    def default_dep_templates(cls):
-        return []
-
-    @classmethod
-    def default_ner_templates(cls):
-        return []
-
     @classmethod
     def default_dep_labels(cls):
         return {0: {'ROOT': True}}
@@ -186,10 +185,11 @@ class Language(object):
         return None
 
     @classmethod
-    def default_matcher(cls, vocab, data_dir=None):
-        if data_dir is None:
-            data_dir = cls.default_data_dir()
-        return Matcher.from_dir(data_dir, vocab)
+    def default_matcher(cls, vocab, data_dir):
+        if path.exists(data_dir):
+            return Matcher.from_dir(data_dir, vocab)
+        else:
+            return None
 
     def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
                  parser=None, entity=None, matcher=None, serializer=None):
@@ -245,9 +245,9 @@ class Language(object):
     def end_training(self, data_dir=None):
         if data_dir is None:
             data_dir = self.data_dir
-        self.parser.model.end_training()
-        self.entity.model.end_training()
-        self.tagger.model.end_training()
+        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
+        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
+        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
         self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt'))
 
         with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_:
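Note: the import fallback added at the top of this and several other modules prefers the C-backed ujson parser but degrades gracefully to the stdlib, so ujson stays an optional dependency. The pattern in isolation:

    try:
        import ujson as json   # fast C implementation
    except ImportError:
        import json            # stdlib fallback; the same call sites work unchanged

    tag_map = json.loads('{"VBD": {"pos": "verb", "tense": "past"}}')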
spacy/lemmatizer.py:

@@ -2,29 +2,41 @@ from __future__ import unicode_literals
 from os import path
 import codecs
 
+try:
+    import ujson as json
+except ImportError:
+    import json
+
+from .parts_of_speech import NOUN, VERB, ADJ
+
 
 class Lemmatizer(object):
-    def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id):
-        self.noun_id = noun_id
-        self.verb_id = verb_id
-        self.adj_id = adj_id
-        self.index = {}
-        self.exc = {}
+    @classmethod
+    def from_dir(cls, data_dir):
+        index = {}
+        exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos))
-            self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos))
+            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+        return cls(index, exc, rules)
+
+    def __init__(self, index, exceptions, rules):
+        self.index = index
+        self.exc = exceptions
+        self.rules = rules
 
     def __call__(self, string, pos):
-
-        return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
-        if pos == self.noun_id:
-            return self.noun(string)
-        elif pos == self.verb_id:
-            return self.verb(string)
-        elif pos == self.adj_id:
-            return self.adj(string)
+        if pos == NOUN:
+            pos = 'noun'
+        elif pos == VERB:
+            pos = 'verb'
+        elif pos == ADJ:
+            pos = 'adj'
         else:
             raise Exception("Cannot lemmatize with unknown pos: %s" % pos)
-        return string
+        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos])
+        return min(lemmas)
 
     def noun(self, string):
         return self(string, 'noun')
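Note: lemmatize() itself is outside this hunk, but the call shows the data model: a per-POS index of known lemmas, an exception table for irregular forms, and suffix-rewrite rules, with min() picking a canonical lemma from the candidate set. A minimal sketch of how such a function could work — an assumption about the internals, not the actual spaCy code:

    def lemmatize(string, index, exc, rules):
        # Hypothetical reimplementation, for illustration only.
        lemmas = set()
        if string in exc:                  # irregular forms win outright
            lemmas.update(exc[string])
        for old, new in rules:             # e.g. ('ies', 'y'), ('s', '')
            if string.endswith(old):
                form = string[:len(string) - len(old)] + new
                if form in index:          # keep only attested lemmas
                    lemmas.add(form)
        return lemmas if lemmas else set([string])

    assert min(lemmatize('ducks', set(['duck']), {}, [('s', '')])) == 'duck'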
spacy/morphology.pxd:

@@ -1,13 +1,16 @@
 from .structs cimport TokenC
+from .strings cimport StringStore
 
 
 cdef class Morphology:
+    cdef readonly object strings
     cdef public object lemmatizer
     cdef public object tag_map
     cdef public object tag_names
     cdef public object tag_ids
     cdef public int n_tags
 
-    cdef int assign_tag(self, TokenC* token, int tag) except -1
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1
+
spacy/morphology.pyx:

@@ -1,4 +1,5 @@
 from os import path
+from .lemmatizer import Lemmatizer
 
 try:
     import ujson as json
@@ -9,7 +10,15 @@ from spacy.parts_of_speech import UNIV_POS_NAMES
+
 
 cdef class Morphology:
+    @classmethod
+    def from_dir(cls, data_dir, lemmatizer=None):
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        if lemmatizer is None:
+            lemmatizer = Lemmatizer.from_dir(data_dir)
+        return cls(tag_map, {}, lemmatizer)
+
     def __init__(self, tag_map, fused_tokens, lemmatizer):
         self.lemmatizer = lemmatizer
         self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
@@ -17,15 +26,13 @@ cdef class Morphology:
         for i, tag_str in enumerate(self.tag_names):
             self.tag_ids[tag_str] = i
 
-    @classmethod
-    def from_dir(cls, data_dir):
-        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        return cls(tag_map, {}, None)
-
-    cdef int assign_tag(self, TokenC* token, int tag) except -1:
+    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1:
+        # TODO Caching
         props = self.tag_map[self.tag_names[tag]]
         token.pos = UNIV_POS_NAMES[props['pos'].upper()]
-        token.tag = tag
+        token.tag = strings[self.tag_names[tag]]
+        lemma = self.lemmatizer(strings[token.lex.orth], token.pos)
+        token.lemma = strings[lemma]
         #token.inflection = # TODO
 
     cdef int assign_from_dict(self, TokenC* token, props) except -1:
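Note: this signature change is the heart of the commit: token.tag used to hold the tag's index into tag_names, and now holds the interned id of the tag string itself, which is why assign_tag needs the StringStore (and can also intern the lemma it computes). A sketch of the resulting invariant (names are illustrative; strings maps string -> id and id -> string both ways, as spaCy's StringStore does):

    tag_str = morphology.tag_names[guess]   # e.g. 'VBD'
    token_tag = strings[tag_str]            # intern: string -> integer id
    assert strings[token_tag] == tag_str    # and the id round-trips back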
spacy/parts_of_speech.pxd:

@@ -2,17 +2,22 @@
 cpdef enum univ_pos_t:
     NO_TAG
     ADJ
-    ADV
     ADP
+    ADV
+    AUX
     CONJ
     DET
+    INTJ
     NOUN
     NUM
+    PART
     PRON
-    PRT
+    PROPN
+    PUNCT
+    SCONJ
+    SYM
     VERB
     X
-    PUNCT
     EOL
     SPACE
     N_UNIV_TAGS
spacy/parts_of_speech.pyx:

@@ -4,18 +4,22 @@ from __future__ import unicode_literals
 UNIV_POS_NAMES = {
     "NO_TAG": NO_TAG,
     "ADJ": ADJ,
-    "ADV": ADV,
     "ADP": ADP,
+    "ADV": ADV,
+    "AUX": AUX,
     "CONJ": CONJ,
     "DET": DET,
+    "INTJ": INTJ,
     "NOUN": NOUN,
     "NUM": NUM,
+    "PART": PART,
     "PRON": PRON,
-    "PRT": PRT,
+    "PROPN": PROPN,
+    "PUNCT": PUNCT,
+    "SCONJ": SCONJ,
+    "SYM": SYM,
     "VERB": VERB,
     "X": X,
-    "PUNCT": PUNCT,
-    "PUNC": PUNCT,
-    "SPACE": SPACE,
-    "EOL": EOL
+    "EOL": EOL,
+    "SPACE": SPACE
 }
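Note: dropping the "PUNC" alias here mirrors the tag map's move from "punc" to "punct". This dict is what lets the JSON tag map stay lower-case while the enum is upper-case; Morphology.assign_tag above does the bridging:

    token_pos = UNIV_POS_NAMES[props['pos'].upper()]   # 'punct' -> PUNCT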
spacy/tagger.pxd:

@@ -1,26 +1,12 @@
-from preshed.maps cimport PreshMapArray
-from preshed.counter cimport PreshCounter
-from cymem.cymem cimport Pool
-
 from ._ml cimport Model
-from .strings cimport StringStore
-from .structs cimport TokenC, LexemeC
-from .parts_of_speech cimport univ_pos_t
+from .structs cimport TokenC
 from .vocab cimport Vocab
 
 
 cdef class Tagger:
-    cdef readonly Pool mem
-    cdef readonly StringStore strings
-    cdef readonly Model model
     cdef readonly Vocab vocab
-    cdef public object lemmatizer
-    cdef PreshMapArray _morph_cache
+    cdef readonly Model model
     cdef public dict freqs
 
-    cdef readonly int n_tags
-
     cdef int predict(self, int i, const TokenC* tokens) except -1
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1
-    #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
-    #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
spacy/tagger.pyx:

@@ -8,7 +8,7 @@ from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
 from .attrs cimport TAG
 from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
-from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
+from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 
 from .attrs cimport *
 from ._ml cimport arg_max
@@ -102,24 +102,10 @@ cdef class Tagger:
         (P2_flags,),
     )
 
-    def make_lemmatizer(self):
-        return None
-
-    def __init__(self, Vocab vocab, templates):
-        self.mem = Pool()
-        self.vocab = vocab
-
-        cdef int n_tags = self.vocab.morphology.n_tags + 1
-
-        self.model = Model(n_tags, templates)
-        self.freqs = {TAG: defaultdict(int)}
-        for tag in self.tag_names:
-            self.freqs[TAG][self.vocab.strings[tag]] = 1
-        self.freqs[TAG][0] = 1
-
-    @property
-    def tag_names(self):
-        return tuple(sorted(self.vocab.morphology.tag_map.keys()))
-
+    @classmethod
+    def blank(cls, vocab, templates):
+        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        return cls(vocab, model)
+
     @classmethod
     def from_dir(cls, data_dir, vocab):
@@ -127,7 +113,22 @@ cdef class Tagger:
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        return cls(vocab, templates)
+        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        return cls(vocab, model)
+
+    def __init__(self, Vocab vocab, model):
+        self.vocab = vocab
+        self.model = model
+
+        # TODO: Move this to tag map
+        self.freqs = {TAG: defaultdict(int)}
+        for tag in self.tag_names:
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
+        self.freqs[TAG][0] = 1
+
+    @property
+    def tag_names(self):
+        return self.vocab.morphology.tag_names
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -142,29 +143,28 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def train(self, Doc tokens, object gold_tag_strs):
         assert len(tokens) == len(gold_tag_strs)
         cdef int i
         cdef int loss
         cdef const weight_t* scores
-        golds = [self.tag_names.index(g) if g is not None else -1
-                 for g in gold_tag_strs]
+        golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
         correct = 0
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
 
-            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
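Note: putting the pieces together, the intended training flow after this commit looks roughly like the sketch below (train_data, vocab, and data_dir are illustrative; Tagger.blank, Tagger.train, and Model.end_training all come from the hunks above):

    tagger = Tagger.blank(vocab, Tagger.default_templates())
    for doc, gold_tags in train_data:             # one PTB tag string per token
        n_correct = tagger.train(doc, gold_tags)
    # Dump the averaged weights, as Language.end_training does.
    tagger.model.end_training(path.join(data_dir, 'pos', 'model'))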