Merge branch 'develop' of ssh://github.com/honnibal/spaCy into develop

commit 094440f9f5
Matthew Honnibal, 2015-09-10 14:51:17 +02:00
8 changed files with 32 additions and 30 deletions

View File

@@ -85,12 +85,16 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
                  use_orig_arc_eager=False):
     dep_model_dir = path.join(model_dir, 'deps')
     ner_model_dir = path.join(model_dir, 'ner')
+    pos_model_dir = path.join(model_dir, 'pos')
     if path.exists(dep_model_dir):
         shutil.rmtree(dep_model_dir)
     if path.exists(ner_model_dir):
         shutil.rmtree(ner_model_dir)
+    if path.exists(pos_model_dir):
+        shutil.rmtree(pos_model_dir)
     os.mkdir(dep_model_dir)
     os.mkdir(ner_model_dir)
+    os.mkdir(pos_model_dir)

     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                  labels=ArcEager.get_labels(gold_tuples),
@@ -140,7 +144,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
         print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                    scorer.tags_acc,
                                                    scorer.token_acc))
+    print('end training')
     nlp.end_training(model_dir)
+    print('done')

 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
              beam_width=None):
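
Note: the training setup now cleans and recreates a third output directory, 'pos', alongside 'deps' and 'ner'. The idiom is the usual reset-then-mkdir pattern; a minimal generic sketch (helper name hypothetical, not part of the diff):

    import os
    import shutil
    from os import path

    def reset_model_dirs(model_dir, subdirs=('deps', 'ner', 'pos')):
        # Remove any stale model output, then recreate empty directories,
        # so each training run starts from a clean slate.
        for name in subdirs:
            d = path.join(model_dir, name)
            if path.exists(d):
                shutil.rmtree(d)
            os.mkdir(d)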

View File

@@ -16,9 +16,9 @@ class Lemmatizer(object):
         index = {}
         exc = {}
         for pos in ['adj', 'adv', 'noun', 'verb']:
-            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
-            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
-        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+            index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
+            exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
+        rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
         return cls(index, exc, rules)

     def __init__(self, index, exceptions, rules):
@@ -33,10 +33,8 @@ class Lemmatizer(object):
             pos = 'verb'
         elif pos == ADJ:
             pos = 'adj'
-        else:
-            return string
         lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
-        return min(lemmas)
+        return lemmas

     def noun(self, string):
         return self(string, 'noun')
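
Behaviour change worth noting: `__call__` now returns the full set of candidate lemmas instead of `min(lemmas)`, and with the `else: return string` fallback gone, an unrecognised POS will raise a KeyError at `self.index[pos]`. A rough sketch of the new contract, with illustrative values:

    # Assuming a Lemmatizer instance and spacy.parts_of_speech.NOUN:
    lemmas = lemmatizer('ducks', NOUN)  # now a set of candidates, e.g. set(['duck'])
    assert isinstance(lemmas, set)
    best = min(lemmas)                  # callers wanting one string now pick it themselves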

View File

@@ -54,8 +54,6 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
     cdef int i
     for i in range(pattern.length):
         if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
-            print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
-            print get_token_attr(token, pattern.spec[i].attr)
             return False
     return True
@@ -82,7 +80,6 @@ def _convert_strings(token_specs, string_store):
             if isinstance(value, bool):
                 value = int(value)
             converted[-1].append((attr, value))
-    print "Converted", converted[-1]
     return converted
@@ -175,13 +172,11 @@ cdef class Matcher:
         cdef Pattern* state
         matches = []
         for token_i in range(doc.length):
-            print 'check', doc[token_i].orth_
             token = &doc.data[token_i]
             q = 0
             for i in range(partials.size()):
                 state = partials.at(i)
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:
@@ -191,7 +186,6 @@ cdef class Matcher:
             for i in range(self.n_patterns):
                 state = self.patterns[i]
                 if match(state, token):
-                    print 'match!'
                     if is_final(state):
                         matches.append(get_entity(state, token, token_i))
                     else:

View File

@@ -43,6 +43,7 @@ cdef class Morphology:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
             analysis.tag = self.rich_tags[tag_id]
             analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
+            self._cache.set(tag_id, token.lex.orth, analysis)
         token.lemma = analysis.lemma
         token.pos = analysis.tag.pos
         token.tag = analysis.tag.name
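
The added `self._cache.set(...)` closes a memoisation gap: on a cache miss the analysis was computed but never stored, so identical (tag, orth) pairs re-ran the lemmatizer every time. A minimal Python sketch of the intended pattern (names hypothetical, not the Cython API):

    def analyse(cache, tag_id, orth, compute):
        # Look up by (tag_id, orth); compute and *store* on a miss.
        analysis = cache.get((tag_id, orth))
        if analysis is None:
            analysis = compute(tag_id, orth)   # the expensive lemmatize step
            cache[(tag_id, orth)] = analysis   # this write-back is what the diff adds
        return analysis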

View File

@@ -16,12 +16,11 @@ cdef class StateClass:
         cdef int i
         for i in range(length + (PADDING * 2)):
             self._ents[i].end = -1
-            self._sent[i].l_edge = i
-            self._sent[i].r_edge = i
         for i in range(length, length + (PADDING * 2)):
             self._sent[i].lex = &EMPTY_LEXEME
         self._sent += PADDING
+        for i in range(length):
+            self._sent[i].l_edge = i
+            self._sent[i].r_edge = i
         self._ents += PADDING
         self._buffer += PADDING
         self._stack += PADDING
@@ -162,11 +161,11 @@ cdef class StateClass:
         cdef int dist = h_i - c_i
         cdef TokenC* h = &self._sent[h_i]
         if c_i > h_i:
-            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
             h.r_kids -= 1
+            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 1 else h_i
         else:
-            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
             h.l_kids -= 1
+            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 1 else h_i

     cdef void open_ent(self, int label) nogil:
         self._ents[self._e_i].start = self.B(0)
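
In the arc-deletion code, the edge update now runs after the child counter is decremented, with the threshold relaxed from >= 2 to >= 1: once the outermost child is detached, the head's edge should come from the surviving outermost child, or fall back to the head itself when no children remain on that side. A simplified plain-Python model of the recomputation (not the Cython state machinery):

    def right_edge_after_removal(head_i, right_children):
        # right_children: indices of the head's right children, outermost last.
        # Removing the outermost child; the old second-rightmost child
        # now bounds the head's right edge.
        remaining = right_children[:-1]
        return remaining[-1] if remaining else head_i

    # e.g. head at index 2 with right children [4, 7]: removing 7 gives
    # right_edge_after_removal(2, [4, 7]) == 4; removing the last child gives 2.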

View File

@@ -67,6 +67,8 @@ cdef class Doc:
         cdef int i
         for i in range(size + (PADDING*2)):
             data_start[i].lex = &EMPTY_LEXEME
+            data_start[i].l_edge = i
+            data_start[i].r_edge = i
         self.data = data_start + PADDING
         self.max_length = size
         self.length = 0
@@ -219,6 +221,8 @@ cdef class Doc:
             t.idx = 0
         else:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
+        t.l_edge = self.length
+        t.r_edge = self.length
         assert t.lex.orth != 0
         t.spacy = has_space
         self.length += 1
@@ -310,6 +314,8 @@ cdef class Doc:
         self.is_parsed = True
         for i in range(self.length):
             self.data[i] = parsed[i]
+            assert self.data[i].l_edge <= i
+            assert self.data[i].r_edge >= i

     def from_array(self, attrs, array):
         cdef int i, col
@@ -396,7 +402,7 @@ cdef class Doc:
         cdef TokenC* token = &self.data[start]
         # Update fields
         token.lex = lex
-        token.spacy = self.data[end].spacy
+        token.spacy = self.data[end-1].spacy
         # What to do about morphology??
         # TODO: token.morph = ???
         token.tag = self.vocab.strings[tag]
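
The `end-1` change fixes an off-by-one: `end` is an exclusive bound, so `self.data[end]` is the first token after the span, and the merged token inherited the wrong trailing-whitespace flag. A small illustration, with hypothetical indices:

    words = ['New', 'York', 'City', 'is', 'big']
    start, end = 0, 3             # merge words[start:end] -> 'New York City'
    last_inside = words[end - 1]  # 'City': its trailing-space flag is the one to keep
    first_outside = words[end]    # 'is': reading data[end] grabbed this token's flag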

View File

@@ -117,7 +117,9 @@ cdef class Vocab:
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
         cdef hash_t key
-        cdef bint is_oov = mem is not self.mem
+        #cdef bint is_oov = mem is not self.mem
+        # TODO
+        is_oov = False
         mem = self.mem
         if len(string) < 3:
             mem = self.mem
@@ -224,19 +226,17 @@ cdef class Vocab:
             raise IOError('LexemeCs file not found at %s' % loc)
         fp = CFile(loc, 'rb')
         cdef LexemeC* lexeme
-        cdef attr_t orth
         cdef hash_t key
         cdef unicode py_str
-        cdef uint64_t bad_bytes
+        cdef attr_t orth
+        assert sizeof(orth) == sizeof(lexeme.orth)
         i = 0
         while True:
-            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             try:
                 fp.read_into(&orth, 1, sizeof(orth))
             except IOError:
                 break
-            # This 64 bit chunk is there for backwards compatibility. Remove on next release.
-            fp.read_into(&bad_bytes, 1, sizeof(bad_bytes))
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
             # Copy data from the file into the lexeme
             fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
             fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
@@ -253,10 +253,8 @@ cdef class Vocab:
             fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
             lexeme.repvec = EMPTY_VEC
-            if orth != lexeme.orth:
-                # TODO: Improve this error message, pending resolution to Issue #64
-                raise IOError('Error reading from lexemes.bin. Integrity check fails.')
-            py_str = self.strings[orth]
+            py_str = self.strings[lexeme.orth]
+            assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix)
             key = hash_string(py_str)
             self._by_hash.set(key, lexeme)
             self._by_orth.set(lexeme.orth, lexeme)
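
Two fixes in the lexeme-loading loop: the obsolete 64-bit `bad_bytes` padding read is dropped, and the `LexemeC` allocation moves below the first `read_into`, so nothing is allocated on the EOF iteration that breaks the loop. A rough Python sketch of the read-first, allocate-after shape (sizes and names illustrative):

    import struct

    def read_lexemes(fp):
        # Read a 64-bit key first; only allocate the record body once the
        # key read succeeds, so EOF never leaves a dangling buffer.
        records = []
        while True:
            raw = fp.read(8)              # the orth id acts as the EOF sentinel
            if len(raw) < 8:
                break                     # clean end-of-file, nothing allocated
            (orth,) = struct.unpack('<Q', raw)
            body = bytearray(64)          # allocate *after* the successful read
            fp.readinto(body)             # fixed-size struct payload (size illustrative)
            records.append((orth, bytes(body)))
        return records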

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
 from os import path
@@ -23,7 +23,7 @@ def test_read_exc():
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))

 def test_noun_lemmas(lemmatizer):
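
The fixture switches to the `Lemmatizer.from_dir` classmethod changed above, which locates the WordNet index/exception files and lemma rules itself. A usage sketch consistent with this commit's other changes (the returned set is an assumption based on the lemmatizer diff):

    lemmatizer = Lemmatizer.from_dir(LOCAL_DATA_DIR)
    lemmas = lemmatizer.noun('aardwolves')
    # With this commit the lemmatizer returns the candidate set rather
    # than a single string, e.g. set(['aardwolf']).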