From 9c1724ecaef838a9c510b7b2b42d48e4ec5ccad6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 00:35:40 +0200 Subject: [PATCH] * Gazetteer stuff working, now need to wire up to API --- spacy/matcher.pyx | 1 + spacy/syntax/ner.pyx | 40 ++++++++++++++- spacy/syntax/stateclass.pxd | 2 +- spacy/tokens/doc.pxd | 5 ++ spacy/tokens/doc.pyx | 91 +++++++++++++++++++++------------ tests/spans/test_merge.py | 19 ++++++- tests/test_matcher.py | 13 ++++- tests/tokens/test_tokens_api.py | 13 ++++- 8 files changed, 146 insertions(+), 38 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index f6f3f95d6..b1b77e162 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -97,4 +97,5 @@ cdef class Matcher: matches.append(get_entity(state, token, token_i)) else: partials.push_back(state + 1) + doc.ents = list(sorted(list(doc.ents) + matches)) return matches diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index fbd580b29..8fa4a03d5 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -160,7 +160,17 @@ cdef class Missing: cdef class Begin: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return label != 0 and not st.entity_is_open() + # Ensure we don't clobber preset entities. If no entity preset, + # ent_iob is 0 + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 1: + return False + elif preset_ent_iob == 2: + return False + elif preset_ent_iob == 3 and st.B_(0).ent_type != label: + return False + else: + return label != 0 and not st.entity_is_open() @staticmethod cdef int transition(StateClass st, int label) nogil: @@ -190,6 +200,14 @@ cdef class Begin: cdef class In: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 3: + return False + # TODO: Is this quite right? + elif st.B_(1).ent_iob != preset_ent_iob: + return False return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod @@ -230,6 +248,14 @@ cdef class In: cdef class Last: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 3: + return False + elif st.B_(1).ent_iob == 1: + return False + return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod @@ -269,6 +295,13 @@ cdef class Last: cdef class Unit: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 1: + return False + elif st.B_(1).ent_iob == 1: + return False return label != 0 and not st.entity_is_open() @staticmethod @@ -300,6 +333,11 @@ cdef class Unit: cdef class Out: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 3: + return False + elif preset_ent_iob == 1: + return False return not st.entity_is_open() @staticmethod diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 905d8cdde..888b01c32 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -125,7 +125,7 @@ cdef class StateClass: cdef void add_arc(self, int head, int child, int label) nogil cdef void del_arc(self, int head, int child) nogil - + cdef void open_ent(self, int label) nogil cdef void close_ent(self) nogil diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7de5e0bea..121018770 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -4,6 +4,11 @@ from preshed.counter cimport PreshCounter from ..vocab cimport Vocab from ..structs cimport TokenC, LexemeC +from ..typedefs cimport attr_t +from ..attrs cimport attr_id_t + + +cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a3ae45733..6d0cd9a8b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -119,40 +119,67 @@ cdef class Doc: def string(self): return u''.join([t.string for t in self]) - @property - def ents(self): - """Yields named-entity Span objects. + property ents: + def __get__(self): + """Yields named-entity Span objects. - Iterate over the span to get individual Token objects, or access the label: + Iterate over the span to get individual Token objects, or access the label: - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') - >>> ents = list(tokens.ents) - >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0]) - (112504, u'PERSON', u'Best ') - """ - cdef int i - cdef const TokenC* token - cdef int start = -1 - cdef int label = 0 - for i in range(self.length): - token = &self.data[i] - if token.ent_iob == 1: - assert start != -1 - pass - elif token.ent_iob == 2: - if start != -1: - yield Span(self, start, i, label=label) - start = -1 - label = 0 - elif token.ent_iob == 3: - if start != -1: - yield Span(self, start, i, label=label) - start = i - label = token.ent_type - if start != -1: - yield Span(self, start, self.length, label=label) + >>> from spacy.en import English + >>> nlp = English() + >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') + >>> ents = list(tokens.ents) + >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0]) + (112504, u'PERSON', u'Best ') + """ + cdef int i + cdef const TokenC* token + cdef int start = -1 + cdef int label = 0 + output = [] + for i in range(self.length): + token = &self.data[i] + if token.ent_iob == 1: + assert start != -1 + elif token.ent_iob == 2 or token.ent_iob == 0: + if start != -1: + output.append(Span(self, start, i, label=label)) + start = -1 + label = 0 + elif token.ent_iob == 3: + if start != -1: + output.append(Span(self, start, i, label=label)) + start = i + label = token.ent_type + if start != -1: + output.append(Span(self, start, self.length, label=label)) + return tuple(output) + + def __set__(self, ents): + # TODO: + # 1. Allow negative matches + # 2. Ensure pre-set NERs are not over-written during statistical prediction + # 3. Test basic data-driven ORTH gazetteer + # 4. Test more nuanced date and currency regex + cdef int i + for i in range(self.length): + self.data[i].ent_type = 0 + self.data[i].ent_iob = 0 + cdef attr_t ent_type + cdef int start, end + for ent_type, start, end in ents: + if ent_type is None: + # Mark as O + for i in range(start, end): + self.data[i].ent_type = 0 + self.data[i].ent_iob = 2 + else: + # Mark (inside) as I + for i in range(start, end): + self.data[i].ent_type = ent_type + self.data[i].ent_iob = 1 + # Set start as B + self.data[start].ent_iob = 3 @property def noun_chunks(self): diff --git a/tests/spans/test_merge.py b/tests/spans/test_merge.py index 3bba13064..e225db043 100644 --- a/tests/spans/test_merge.py +++ b/tests/spans/test_merge.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import pytest - @pytest.mark.models def test_merge_tokens(EN): tokens = EN(u'Los Angeles start.') @@ -32,3 +31,21 @@ def test_merge_heads(EN): def test_issue_54(EN): text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' tokens = EN(text, merge_mwes=True) + +@pytest.mark.models +def test_np_merges(EN): + text = u'displaCy is a parse tool built with Javascript' + tokens = EN(text) + assert tokens[4].head.i == 1 + tokens.merge(tokens[2].idx, tokens[4].idx + len(tokens[4]), u'NP', u'tool', u'O') + assert tokens[2].head.i == 1 + tokens = EN('displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript.') + + ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) + for e in tokens.ents] + for start, end, label, lemma in ents: + merged = tokens.merge(start, end, label, lemma, label) + assert merged != None, (start, end, label, lemma) + for tok in tokens: + print tok.orth_, tok.dep_, tok.head.orth_ + diff --git a/tests/test_matcher.py b/tests/test_matcher.py index fb3665623..06950253c 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -47,5 +47,14 @@ def test_match_multi(matcher, EN): assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 4), (EN.vocab.strings['product'], 5, 6)] -def test_dummy(): - pass +def test_match_preserved(matcher, EN): + doc = EN.tokenizer('I like Java') + EN.tagger(doc) + EN.entity(doc) + assert len(doc.ents) == 0 + doc = EN.tokenizer('I like Java') + matcher(doc) + assert len(doc.ents) == 1 + EN.tagger(doc) + EN.entity(doc) + assert len(doc.ents) == 1 diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index b935bbce7..e1238373f 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -4,7 +4,6 @@ from spacy.tokens import Doc import pytest - @pytest.mark.models def test_getitem(EN): tokens = EN(u'Give it back! He pleaded.') @@ -32,3 +31,15 @@ def test_serialize_whitespace(EN): assert tokens.string == new_tokens.string assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] + + +def test_set_ents(EN): + tokens = EN.tokenizer(u'I use goggle chrone to surf the web') + assert len(tokens.ents) == 0 + tokens.ents = [(EN.vocab.strings['PRODUCT'], 2, 4)] + assert len(list(tokens.ents)) == 1 + assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] + ent = tokens.ents[0] + assert ent.label_ == 'PRODUCT' + assert ent.start == 2 + assert ent.end == 4