Merge branch 'develop' of ssh://github.com/honnibal/spaCy into develop

2025-10-21 11:14:32 +03:00 · 2015-09-10 14:51:17 +02:00 · 2015-09-10 14:51:17 +02:00 · 094440f9f5
commit 094440f9f5
parent c3f773cd63 e7e529edf4
8 changed files with 32 additions and 30 deletions
--- a/bin/parser/train.py
+++ b/bin/parser/train.py
@@ -85,12 +85,16 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
          use_orig_arc_eager=False):
    dep_model_dir = path.join(model_dir, 'deps')
    ner_model_dir = path.join(model_dir, 'ner')
    pos_model_dir = path.join(model_dir, 'pos')
    if path.exists(dep_model_dir):
        shutil.rmtree(dep_model_dir)
    if path.exists(ner_model_dir):
        shutil.rmtree(ner_model_dir)
    if path.exists(pos_model_dir):
        shutil.rmtree(pos_model_dir)
    os.mkdir(dep_model_dir)
    os.mkdir(ner_model_dir)
    os.mkdir(pos_model_dir)
    Config.write(dep_model_dir, 'config', features=feat_set, seed=seed,
                 labels=ArcEager.get_labels(gold_tuples),
@@ -140,7 +144,9 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic',
        print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f,
                                                   scorer.tags_acc,
                                                   scorer.token_acc))
    print('end training')
    nlp.end_training(model_dir)
    print('done')
 def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False,
             beam_width=None):
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -16,9 +16,9 @@ class Lemmatizer(object):
        index = {}
        exc = {}
        for pos in ['adj', 'adv', 'noun', 'verb']:
-            index[pos] = read_index(path.join(data_dir, 'index.%s' % pos))
+            index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
-            exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos))
+            exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
-        rules = json.load(open(path.join(data_dir, 'lemma_rules.json')))
+        rules = json.load(open(path.join(data_dir, 'vocab', 'lemma_rules.json')))
        return cls(index, exc, rules)
    def __init__(self, index, exceptions, rules):
@@ -33,10 +33,8 @@ class Lemmatizer(object):
            pos = 'verb'
        elif pos == ADJ:
            pos = 'adj'
        else:
            return string
        lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, []))
-        return min(lemmas)
+        return lemmas
    def noun(self, string):
        return self(string, 'noun')
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -54,8 +54,6 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1:
    cdef int i
    for i in range(pattern.length):
        if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value:
            print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value
            print get_token_attr(token, pattern.spec[i].attr)
            return False
    return True
@@ -82,7 +80,6 @@ def _convert_strings(token_specs, string_store):
            if isinstance(value, bool):
                value = int(value)
            converted[-1].append((attr, value))
            print "Converted", converted[-1]
    return converted
@@ -175,13 +172,11 @@ cdef class Matcher:
        cdef Pattern* state
        matches = []
        for token_i in range(doc.length):
            print 'check', doc[token_i].orth_
            token = &doc.data[token_i]
            q = 0
            for i in range(partials.size()):
                state = partials.at(i)
                if match(state, token):
                    print 'match!'
                    if is_final(state):
                        matches.append(get_entity(state, token, token_i))
                    else:
@@ -191,7 +186,6 @@ cdef class Matcher:
            for i in range(self.n_patterns):
                state = self.patterns[i]
                if match(state, token):
                    print 'match!'
                    if is_final(state):
                        matches.append(get_entity(state, token, token_i))
                    else:
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -43,6 +43,7 @@ cdef class Morphology:
            analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
            analysis.tag = self.rich_tags[tag_id]
            analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth)
            self._cache.set(tag_id, token.lex.orth, analysis)
        token.lemma = analysis.lemma
        token.pos = analysis.tag.pos
        token.tag = analysis.tag.name
--- a/spacy/syntax/stateclass.pyx
+++ b/spacy/syntax/stateclass.pyx
@@ -16,12 +16,11 @@ cdef class StateClass:
        cdef int i
        for i in range(length + (PADDING * 2)):
            self._ents[i].end = -1
            self._sent[i].l_edge = i
            self._sent[i].r_edge = i
        for i in range(length, length + (PADDING * 2)):
            self._sent[i].lex = &EMPTY_LEXEME
        self._sent += PADDING
        for i in range(length):
            self._sent[i].l_edge = i
            self._sent[i].r_edge = i
        self._ents += PADDING
        self._buffer += PADDING
        self._stack += PADDING
@@ -162,11 +161,11 @@ cdef class StateClass:
        cdef int dist = h_i - c_i
        cdef TokenC* h = &self._sent[h_i]
        if c_i > h_i:
            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 2 else h_i
            h.r_kids -= 1
            h.r_edge = self.R_(h_i, 2).r_edge if h.r_kids >= 1 else h_i
        else:
            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 2 else h_i
            h.l_kids -= 1
            h.l_edge = self.L_(h_i, 2).l_edge if h.l_kids >= 1 else h_i
    cdef void open_ent(self, int label) nogil:
        self._ents[self._e_i].start = self.B(0)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -67,6 +67,8 @@ cdef class Doc:
        cdef int i
        for i in range(size + (PADDING*2)):
            data_start[i].lex = &EMPTY_LEXEME
            data_start[i].l_edge = i
            data_start[i].r_edge = i
        self.data = data_start + PADDING
        self.max_length = size
        self.length = 0
@@ -219,6 +221,8 @@ cdef class Doc:
            t.idx = 0
        else:
            t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
        t.l_edge = self.length
        t.r_edge = self.length
        assert t.lex.orth != 0
        t.spacy = has_space
        self.length += 1
@@ -310,6 +314,8 @@ cdef class Doc:
        self.is_parsed = True
        for i in range(self.length):
            self.data[i] = parsed[i]
            assert self.data[i].l_edge <= i
            assert self.data[i].r_edge >= i
    def from_array(self, attrs, array):
        cdef int i, col
@@ -396,7 +402,7 @@ cdef class Doc:
        cdef TokenC* token = &self.data[start]
        # Update fields
        token.lex = lex
-        token.spacy = self.data[end].spacy
+        token.spacy = self.data[end-1].spacy
        # What to do about morphology??
        # TODO: token.morph = ???
        token.tag = self.vocab.strings[tag]
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -117,7 +117,9 @@ cdef class Vocab:
    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL:
        cdef hash_t key
-        cdef bint is_oov = mem is not self.mem
+        #cdef bint is_oov = mem is not self.mem
        # TODO
        is_oov = False
        mem = self.mem
        if len(string) < 3:
            mem = self.mem
@@ -224,19 +226,17 @@ cdef class Vocab:
            raise IOError('LexemeCs file not found at %s' % loc)
        fp = CFile(loc, 'rb')
        cdef LexemeC* lexeme
        cdef attr_t orth
        cdef hash_t key
        cdef unicode py_str
-        cdef uint64_t bad_bytes
+        cdef attr_t orth
        assert sizeof(orth) == sizeof(lexeme.orth)
        i = 0
        while True:
            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
            try:
                fp.read_into(&orth, 1, sizeof(orth))
            except IOError:
                break
-            # This 64 bit chunk is there for backwards compatibility. Remove on next release.
+            lexeme = <LexemeC*>self.mem.alloc(sizeof(LexemeC), 1)
            fp.read_into(&bad_bytes, 1, sizeof(bad_bytes))
            # Copy data from the file into the lexeme
            fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags))
            fp.read_into(&lexeme.id, 1, sizeof(lexeme.id))
@@ -253,10 +253,8 @@ cdef class Vocab:
            fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm))
            lexeme.repvec = EMPTY_VEC
-            if orth != lexeme.orth:
+            py_str = self.strings[lexeme.orth]
-                # TODO: Improve this error message, pending resolution to Issue #64
+            assert py_str[-3:] == self.strings[lexeme.suffix], "%s (%d) suffix %s (%d)" % (repr(py_str), lexeme.orth, repr(self.strings[lexeme.suffix]), lexeme.suffix)
                raise IOError('Error reading from lexemes.bin. Integrity check fails.')
            py_str = self.strings[orth]
            key = hash_string(py_str)
            self._by_hash.set(key, lexeme)
            self._by_orth.set(lexeme.orth, lexeme)
--- a/tests/tagger/test_lemmatizer.py
+++ b/tests/tagger/test_lemmatizer.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
+from spacy.lemmatizer import Lemmatizer, read_index, read_exc
 from spacy.en import LOCAL_DATA_DIR
 from os import path
@@ -23,7 +23,7 @@ def test_read_exc():
@pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
 def test_noun_lemmas(lemmatizer):