mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
Merge branch 'develop' of ssh://github.com/honnibal/spaCy into develop
This commit is contained in:
commit
094440f9f5
|
@ -85,12 +85,16 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
use_orig_arc_eager=False):
|
||||
dep_model_dir = path.join(model_dir, 'deps')
|
||||
ner_model_dir = path.join(model_dir, 'ner')
|
||||
pos_model_dir = path.join(model_dir, 'pos')
|
||||
if path.exists(dep_model_dir):
|
||||
shutil.rmtree(dep_model_dir)
|
||||
if path.exists(ner_model_dir):
|
||||
shutil.rmtree(ner_model_dir)
|
||||
if path.exists(pos_model_dir):
|
||||
shutil.rmtree(pos_model_dir)
|
||||
os.mkdir(dep_model_dir)
|
||||
os.mkdir(ner_model_dir)
|
||||
os.mkdir(pos_model_dir)
|
||||
|
||||
Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
|
||||
labels=ArcEager.get_labels(gold_tuples),
|
||||
|
@ -140,7 +144,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
|
|||
print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
|
||||
scorer.tags_acc,
|
||||
scorer.token_acc))
|
||||
print('end training')
|
||||
nlp.end_training(model_dir)
|
||||
print('done')
|
||||
|
||||
def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
|
||||
beam_width=None):
|
||||
|
|
|
@ -16,9 +16,9 @@ class Lemmatizer(object):
|
|||
index = {}
|
||||
exc = {}
|
||||
for pos in ['adj', 'adv', 'noun', 'verb']:
|
||||
index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
|
||||
exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
|
||||
rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
|
||||
index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
|
||||
exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
|
||||
rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
|
||||
return cls(index, exc, rules)
|
||||
|
||||
def __init__(self, index, exceptions, rules):
|
||||
|
@ -33,10 +33,8 @@ class Lemmatizer(object):
|
|||
pos = 'verb'
|
||||
elif pos == ADJ:
|
||||
pos = 'adj'
|
||||
else:
|
||||
return string
|
||||
lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
|
||||
return min(lemmas)
|
||||
return lemmas
|
||||
|
||||
def noun(self, string):
|
||||
return self(string, 'noun')
|
||||
|
|
|
@ -54,8 +54,6 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
|
|||
cdef int i
|
||||
for i in range(pattern.length):
|
||||
if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
|
||||
print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
|
||||
print get_token_attr(token, pattern.spec[i].attr)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
@ -82,7 +80,6 @@ def _convert_strings(token_specs, string_store):
|
|||
if isinstance(value, bool):
|
||||
value = int(value)
|
||||
converted[-1].append((attr, value))
|
||||
print "Converted", converted[-1]
|
||||
return converted
|
||||
|
||||
|
||||
|
@ -175,13 +172,11 @@ cdef class Matcher:
|
|||
cdef Pattern* state
|
||||
matches = []
|
||||
for token_i in range(doc.length):
|
||||
print 'check', doc[token_i].orth_
|
||||
token = &doc.data[token_i]
|
||||
q = 0
|
||||
for i in range(partials.size()):
|
||||
state = partials.at(i)
|
||||
if match(state, token):
|
||||
print 'match!'
|
||||
if is_final(state):
|
||||
matches.append(get_entity(state, token, token_i))
|
||||
else:
|
||||
|
@ -191,7 +186,6 @@ cdef class Matcher:
|
|||
for i in range(self.n_patterns):
|
||||
state = self.patterns[i]
|
||||
if match(state, token):
|
||||
print 'match!'
|
||||
if is_final(state):
|
||||
matches.append(get_entity(state, token, token_i))
|
||||
else:
|
||||
|
|
|
@ -43,6 +43,7 @@ cdef class Morphology:
|
|||
analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
|
||||
analysis.tag = self.rich_tags[tag_id]
|
||||
analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
|
||||
self._cache.set(tag_id, token.lex.orth, analysis)
|
||||
token.lemma = analysis.lemma
|
||||
token.pos = analysis.tag.pos
|
||||
token.tag = analysis.tag.name
|
||||
|
|
|
@ -16,12 +16,11 @@ cdef class StateClass:
|
|||
cdef int i
|
||||
for i in range(length + (PADDING * 2)):
|
||||
self._ents[i].end = -1
|
||||
self._sent[i].l_edge = i
|
||||
self._sent[i].r_edge = i
|
||||
for i in range(length, length + (PADDING * 2)):
|
||||
self._sent[i].lex = &EMPTY_LEXEME
|
||||
self._sent += PADDING
|
||||
for i in range(length):
|
||||
self._sent[i].l_edge = i
|
||||
self._sent[i].r_edge = i
|
||||
self._ents += PADDING
|
||||
self._buffer += PADDING
|
||||
self._stack += PADDING
|
||||
|
@ -162,11 +161,11 @@ cdef class StateClass:
|
|||
cdef int dist = h_i - c_i
|
||||
cdef TokenC* h = &self._sent[h_i]
|
||||
if c_i > h_i:
|
||||
h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
|
||||
h.r_kids -= 1
|
||||
h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 1 else h_i
|
||||
else:
|
||||
h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
|
||||
h.l_kids -= 1
|
||||
h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 1 else h_i
|
||||
|
||||
cdef void open_ent(self, int label) nogil:
|
||||
self._ents[self._e_i].start = self.B(0)
|
||||
|
|
|
@ -67,6 +67,8 @@ cdef class Doc:
|
|||
cdef int i
|
||||
for i in range(size + (PADDING*2)):
|
||||
data_start[i].lex = &EMPTY_LEXEME
|
||||
data_start[i].l_edge = i
|
||||
data_start[i].r_edge = i
|
||||
self.data = data_start + PADDING
|
||||
self.max_length = size
|
||||
self.length = 0
|
||||
|
@ -219,6 +221,8 @@ cdef class Doc:
|
|||
t.idx = 0
|
||||
else:
|
||||
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
|
||||
t.l_edge = self.length
|
||||
t.r_edge = self.length
|
||||
assert t.lex.orth != 0
|
||||
t.spacy = has_space
|
||||
self.length += 1
|
||||
|
@ -310,6 +314,8 @@ cdef class Doc:
|
|||
self.is_parsed = True
|
||||
for i in range(self.length):
|
||||
self.data[i] = parsed[i]
|
||||
assert self.data[i].l_edge <= i
|
||||
assert self.data[i].r_edge >= i
|
||||
|
||||
def from_array(self, attrs, array):
|
||||
cdef int i, col
|
||||
|
@ -396,7 +402,7 @@ cdef class Doc:
|
|||
cdef TokenC* token = &self.data[start]
|
||||
# Update fields
|
||||
token.lex = lex
|
||||
token.spacy = self.data[end].spacy
|
||||
token.spacy = self.data[end-1].spacy
|
||||
# What to do about morphology??
|
||||
# TODO: token.morph = ???
|
||||
token.tag = self.vocab.strings[tag]
|
||||
|
|
|
@ -117,7 +117,9 @@ cdef class Vocab:
|
|||
|
||||
cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
|
||||
cdef hash_t key
|
||||
cdef bint is_oov = mem is not self.mem
|
||||
#cdef bint is_oov = mem is not self.mem
|
||||
# TODO
|
||||
is_oov = False
|
||||
mem = self.mem
|
||||
if len(string) < 3:
|
||||
mem = self.mem
|
||||
|
@ -224,19 +226,17 @@ cdef class Vocab:
|
|||
raise IOError('LexemeCs file not found at %s' % loc)
|
||||
fp = CFile(loc, 'rb')
|
||||
cdef LexemeC* lexeme
|
||||
cdef attr_t orth
|
||||
cdef hash_t key
|
||||
cdef unicode py_str
|
||||
cdef uint64_t bad_bytes
|
||||
cdef attr_t orth
|
||||
assert sizeof(orth) == sizeof(lexeme.orth)
|
||||
i = 0
|
||||
while True:
|
||||
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
try:
|
||||
fp.read_into(&orth, 1, sizeof(orth))
|
||||
except IOError:
|
||||
break
|
||||
# This 64 bit chunk is there for backwards compatibility. Remove on next release.
|
||||
fp.read_into(&bad_bytes, 1, sizeof(bad_bytes))
|
||||
lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
|
||||
# Copy data from the file into the lexeme
|
||||
fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
|
||||
fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
|
||||
|
@ -253,10 +253,8 @@ cdef class Vocab:
|
|||
fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
|
||||
|
||||
lexeme.repvec = EMPTY_VEC
|
||||
if orth != lexeme.orth:
|
||||
# TODO: Improve this error message, pending resolution to Issue #64
|
||||
raise IOError('Error reading from lexemes.bin. Integrity check fails.')
|
||||
py_str = self.strings[orth]
|
||||
py_str = self.strings[lexeme.orth]
|
||||
assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix)
|
||||
key = hash_string(py_str)
|
||||
self._by_hash.set(key, lexeme)
|
||||
self._by_orth.set(lexeme.orth, lexeme)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
|
||||
from spacy.lemmatizer import Lemmatizer, read_index, read_exc
|
||||
from spacy.en import LOCAL_DATA_DIR
|
||||
from os import path
|
||||
|
||||
|
@ -23,7 +23,7 @@ def test_read_exc():
|
|||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
|
||||
return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
|
||||
|
||||
|
||||
def test_noun_lemmas(lemmatizer):
|
||||
|
|
Loading…
Reference in New Issue
Block a user