Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)

Commit 094440f9f5: Merge branch 'develop' of ssh://github.com/honnibal/spaCy into develop

@@ -85,12 +85,16 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
           use_orig_arc_eager=False):
     dep_model_dir = path.join(model_dir, 'deps')
     ner_model_dir = path.join(model_dir, 'ner')
+    pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(dep_model_dir):
         shutil.rmtree(dep_model_dir)
     if path.exists(ner_model_dir):
         shutil.rmtree(ner_model_dir)
+    if path.exists(pos_model_dir):
+        shutil.rmtree(pos_model_dir)
     os.mkdir(dep_model_dir)
     os.mkdir(ner_model_dir)
+    os.mkdir(pos_model_dir)

     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                  labels=ArcEager.get_labels(gold_tuples),

@@ -140,7 +144,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
         print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                    scorer.tags_acc,
                                                    scorer.token_acc))
+    print('end training')
     nlp.end_training(model_dir)
+    print('done')

 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None):

@@ -16,9 +16,9 @@ class Lemmatizer(object):
         index = {}
         exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
-            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
-        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+            index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
         return cls(index, exc, rules)

     def __init__(self, index, exceptions, rules):

@@ -33,10 +33,8 @@ class Lemmatizer(object):
             pos = 'verb'
         elif pos == ADJ:
             pos = 'adj'
-        else:
-            return string
         lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
-        return min(lemmas)
+        return lemmas

     def noun(self, string):
         return self(string, 'noun')

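Taken together, the two Lemmatizer hunks above change both where the data lives and what the lemmatizer returns: the WordNet index and exception files are now read from a wordnet/ subdirectory (with lemma_rules.json under vocab/), and __call__ returns all candidate lemmas instead of min(lemmas). A rough usage sketch based on the updated test fixture; the input word 'ducks' and the exact return type are illustrative assumptions, not taken from the diff:

from os import path

from spacy.lemmatizer import Lemmatizer
from spacy.en import LOCAL_DATA_DIR

# Tables are loaded from data_dir/wordnet/* and data_dir/vocab/lemma_rules.json.
lemmatizer = Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))

# The lemmatizer now hands back every candidate lemma, not a single string.
lemmas = lemmatizer.noun('ducks')
assert 'duck' in lemmas    # before this change: lemmatizer.noun('ducks') == 'duck'
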
@@ -54,8 +54,6 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
     cdef int i
     for i in range(pattern.length):
         if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
-            print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
-            print get_token_attr(token, pattern.spec[i].attr)
             return False
     return True

@@ -82,7 +80,6 @@ def _convert_strings(token_specs, string_store):
             if isinstance(value, bool):
                 value = int(value)
             converted[-1].append((attr, value))
-    print "Converted", converted[-1]
     return converted


@@ -175,13 +172,11 @@ cdef class Matcher:
         cdef Pattern* state
         matches = []
         for token_i in range(doc.length):
-            print 'check', doc[token_i].orth_
             token = &doc.data[token_i]
             q = 0
             for i in range(partials.size()):
                 state = partials.at(i)
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:

@@ -191,7 +186,6 @@ cdef class Matcher:
             for i in range(self.n_patterns):
                 state = self.patterns[i]
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:

@@ -43,6 +43,7 @@ cdef class Morphology:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
             analysis.tag = self.rich_tags[tag_id]
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
+            self._cache.set(tag_id, token.lex.orth, analysis)
         token.lemma = analysis.lemma
         token.pos = analysis.tag.pos
         token.tag = analysis.tag.name

@@ -16,12 +16,11 @@ cdef class StateClass:
         cdef int i
         for i in range(length + (PADDING * 2)):
             self._ents[i].end = -1
+            self._sent[i].l_edge = i
+            self._sent[i].r_edge = i
         for i in range(length, length + (PADDING * 2)):
             self._sent[i].lex = &EMPTY_LEXEME
         self._sent += PADDING
-        for i in range(length):
-            self._sent[i].l_edge = i
-            self._sent[i].r_edge = i
         self._ents += PADDING
         self._buffer += PADDING
         self._stack += PADDING

@@ -162,11 +161,11 @@ cdef class StateClass:
         cdef int dist = h_i - c_i
         cdef TokenC* h = &self._sent[h_i]
         if c_i > h_i:
+            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
             h.r_kids -= 1
-            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 1 else h_i
         else:
+            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
             h.l_kids -= 1
-            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 1 else h_i

     cdef void open_ent(self, int label) nogil:
         self._ents[self._e_i].start = self.B(0)

@@ -67,6 +67,8 @@ cdef class Doc:
         cdef int i
         for i in range(size + (PADDING*2)):
             data_start[i].lex = &EMPTY_LEXEME
+            data_start[i].l_edge = i
+            data_start[i].r_edge = i
         self.data = data_start + PADDING
         self.max_length = size
         self.length = 0

@@ -219,6 +221,8 @@ cdef class Doc:
             t.idx = 0
         else:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
+        t.l_edge = self.length
+        t.r_edge = self.length
         assert t.lex.orth != 0
         t.spacy = has_space
         self.length += 1

@@ -310,6 +314,8 @@ cdef class Doc:
         self.is_parsed = True
         for i in range(self.length):
             self.data[i] = parsed[i]
+            assert self.data[i].l_edge <= i
+            assert self.data[i].r_edge >= i

     def from_array(self, attrs, array):
         cdef int i, col

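The asserts added in the hunk above spell out the invariant that the new l_edge/r_edge bookkeeping (initialised per token in StateClass and Doc earlier in this commit) is meant to maintain: a token's left edge never lies to its right, and its right edge never lies to its left. A small standalone check of that invariant, using plain (l_edge, r_edge) tuples as a hypothetical stand-in for the TokenC fields:

def check_edge_invariant(edges):
    # edges: one (l_edge, r_edge) pair per token, ordered by token index.
    for i, (l_edge, r_edge) in enumerate(edges):
        assert l_edge <= i <= r_edge

# Freshly initialised tokens are their own edges, as in the added init code.
check_edge_invariant([(0, 0), (1, 1), (2, 2)])
# Attaching token 1 under token 0 widens token 0's right edge; still valid.
check_edge_invariant([(0, 1), (1, 1), (2, 2)])
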
@@ -396,7 +402,7 @@ cdef class Doc:
         cdef TokenC* token = &self.data[start]
         # Update fields
         token.lex = lex
-        token.spacy = self.data[end].spacy
+        token.spacy = self.data[end-1].spacy
         # What to do about morphology??
         # TODO: token.morph = ???
         token.tag = self.vocab.strings[tag]

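The end to end-1 change above reads as an off-by-one fix: if end is the exclusive bound of the merged span, the merged token's trailing-space flag must come from the last token inside the span, not the token after it. A tiny illustration under that assumption (the example tokens are hypothetical):

# Tokens of "New York, USA" with their trailing-space flags.
words  = ['New', 'York', ',', 'USA']
spacy_ = [True,  False,  True, False]

start, end = 0, 2                 # merge tokens 0..1 ("New York"); end is exclusive
assert spacy_[end - 1] is False   # "York" has no space before the comma
assert spacy_[end] is True        # the old index would wrongly inherit the comma's flag
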
@@ -117,7 +117,9 @@ cdef class Vocab:

     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
         cdef hash_t key
-        cdef bint is_oov = mem is not self.mem
+        #cdef bint is_oov = mem is not self.mem
+        # TODO
+        is_oov = False
         mem = self.mem
         if len(string) < 3:
             mem = self.mem

@@ -224,19 +226,17 @@ cdef class Vocab:
             raise IOError('LexemeCs file not found at %s' % loc)
         fp = CFile(loc, 'rb')
         cdef LexemeC* lexeme
-        cdef attr_t orth
         cdef hash_t key
         cdef unicode py_str
-        cdef uint64_t bad_bytes
+        cdef attr_t orth
+        assert sizeof(orth) == sizeof(lexeme.orth)
         i = 0
         while True:
-            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             try:
                 fp.read_into(&orth, 1, sizeof(orth))
             except IOError:
                 break
-            # This 64 bit chunk is there for backwards compatibility. Remove on next release.
-            fp.read_into(&bad_bytes, 1, sizeof(bad_bytes))
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             # Copy data from the file into the lexeme
             fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
             fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))

@@ -253,10 +253,8 @@
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))

             lexeme.repvec = EMPTY_VEC
-            if orth != lexeme.orth:
-                # TODO: Improve this error message, pending resolution to Issue #64
-                raise IOError('Error reading from lexemes.bin. Integrity check fails.')
-            py_str = self.strings[orth]
+            py_str = self.strings[lexeme.orth]
+            assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix)
             key = hash_string(py_str)
             self._by_hash.set(key, lexeme)
             self._by_orth.set(lexeme.orth, lexeme)

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
 from os import path

@@ -23,7 +23,7 @@ def test_read_exc():

 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))


 def test_noun_lemmas(lemmatizer):