From c2f475925718a73fcb13070d8039a7bb59e1e3ec Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 12 Mar 2018 23:03:05 +0100 Subject: [PATCH 1/6] Fix test for Python 2 --- spacy/tests/parser/test_arc_eager_oracle.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 9c235b832..5f3a553e2 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals from ...vocab import Vocab from ...pipeline import DependencyParser from ...tokens import Doc From cca66abf1ed161ead177404f38cb7c0ee9365922 Mon Sep 17 00:00:00 2001 From: DuyguA Date: Wed, 14 Mar 2018 11:34:22 +0100 Subject: [PATCH 2/6] quick typo fix --- spacy/lang/en/lemmatizer/lookup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/en/lemmatizer/lookup.py b/spacy/lang/en/lemmatizer/lookup.py index 86c1a89d3..66ab2b70b 100644 --- a/spacy/lang/en/lemmatizer/lookup.py +++ b/spacy/lang/en/lemmatizer/lookup.py @@ -1310,7 +1310,7 @@ LOOKUP = { "alphabets": "alphabet", "alphas": "alpha", "alpines": "alpine", - "also": "conjurer", + "also": "also", "also-rans": "also-ran", "altars": "altar", "alterations": "alteration", From 1a513f71e3397b7013044fab7df1bd41248ba526 Mon Sep 17 00:00:00 2001 From: DuyguA Date: Wed, 14 Mar 2018 11:57:15 +0100 Subject: [PATCH 3/6] removed also from lookup --- spacy/lang/en/lemmatizer/lookup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/lang/en/lemmatizer/lookup.py b/spacy/lang/en/lemmatizer/lookup.py index 66ab2b70b..063cf4cf4 100644 --- a/spacy/lang/en/lemmatizer/lookup.py +++ b/spacy/lang/en/lemmatizer/lookup.py @@ -1310,7 +1310,6 @@ LOOKUP = { "alphabets": "alphabet", "alphas": "alpha", "alpines": "alpine", - "also": "also", "also-rans": "also-ran", "altars": "altar", "alterations": "alteration", From be4f6da16bb6c19c1a2ebf78cd5aa85a91a93b36 Mon Sep 17 00:00:00 2001 From: DuyguA Date: Wed, 14 Mar 2018 14:45:57 +0100 Subject: [PATCH 4/6] maybe not a good idea to remove also --- spacy/lang/en/lemmatizer/lookup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/lang/en/lemmatizer/lookup.py b/spacy/lang/en/lemmatizer/lookup.py index 063cf4cf4..66ab2b70b 100644 --- a/spacy/lang/en/lemmatizer/lookup.py +++ b/spacy/lang/en/lemmatizer/lookup.py @@ -1310,6 +1310,7 @@ LOOKUP = { "alphabets": "alphabet", "alphas": "alpha", "alpines": "alpine", + "also": "also", "also-rans": "also-ran", "altars": "altar", "alterations": "alteration", From 9ad5df41fefaa858f4b1aebb06f789a2ac8ffef6 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 15 Mar 2018 00:11:18 +0100 Subject: [PATCH 5/6] Fix whitespace --- spacy/pipeline.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 6fbf95eea..a647521d2 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -139,7 +139,7 @@ class Pipe(object): problem. """ raise NotImplementedError - + def create_optimizer(self): return create_default_optimizer(self.model.ops, **self.cfg.get('optimizer', {})) @@ -935,7 +935,7 @@ cdef class DependencyParser(Parser): @property def postprocesses(self): return [nonproj.deprojectivize] - + def add_multitask_objective(self, target): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) @@ -956,7 +956,7 @@ cdef class EntityRecognizer(Parser): TransitionSystem = BiluoPushDown nr_feature = 6 - + def add_multitask_objective(self, target): labeller = MultitaskObjective(self.vocab, target=target) self._multitasks.append(labeller) From d854f69fe3fd32e3ee34100e2f0410df2389222c Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 15 Mar 2018 00:18:51 +0100 Subject: [PATCH 6/6] Add built-in factories for merge_entities and merge_noun_chunks Allows adding those components to the pipeline out-of-the-box if they're defined in a model's meta.json. Also allows usage as nlp.add_pipe(nlp.create_pipe('merge_entities')). --- spacy/language.py | 5 ++- spacy/pipeline.pyx | 28 ++++++++++++++++ spacy/tests/pipeline/test_factories.py | 44 ++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 spacy/tests/pipeline/test_factories.py diff --git a/spacy/language.py b/spacy/language.py index bd1e8d012..f04da7d30 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -17,6 +17,7 @@ from .vocab import Vocab from .lemmatizer import Lemmatizer from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter +from .pipeline import merge_noun_chunks, merge_entities from .compat import json_dumps, izip, basestring_ from .gold import GoldParse from .scorer import Scorer @@ -105,7 +106,9 @@ class Language(object): 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), 'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), - 'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg) + 'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), + 'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks, + 'merge_entities': lambda nlp, **cfg: merge_entities } def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index a647521d2..f4a654591 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -69,6 +69,34 @@ class SentenceSegmenter(object): yield doc[start:len(doc)] +def merge_noun_chunks(doc): + """Merge noun chunks into a single token. + + doc (Doc): The Doc object. + RETURNS (Doc): The Doc object with merged noun chunks. + """ + if not doc.is_parsed: + return + spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep) + for np in doc.noun_chunks] + for start, end, tag, dep in spans: + doc.merge(start, end, tag=tag, dep=dep) + return doc + + +def merge_entities(doc): + """Merge entities into a single token. + + doc (Doc): The Doc object. + RETURNS (Doc): The Doc object with merged noun entities. + """ + spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) + for e in doc.ents] + for start, end, tag, dep, ent_type in spans: + doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type) + return doc + + class Pipe(object): """This class is not instantiated directly. Components inherit from it, and it defines the interface that components should follow to function as diff --git a/spacy/tests/pipeline/test_factories.py b/spacy/tests/pipeline/test_factories.py new file mode 100644 index 000000000..35c42ce56 --- /dev/null +++ b/spacy/tests/pipeline/test_factories.py @@ -0,0 +1,44 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from ..util import get_doc +from ...language import Language +from ...tokens import Span +from ... import util + +@pytest.fixture +def doc(en_tokenizer): + text = 'I like New York in Autumn.' + heads = [1, 0, 1, -2, -3, -1, -5] + tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + pos = ['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT'] + deps = ['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'] + tokens = en_tokenizer(text) + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads, + tags=tags, pos=pos, deps=deps) + doc.ents = [Span(doc, 2, 4, doc.vocab.strings['GPE'])] + doc.is_parsed = True + doc.is_tagged = True + return doc + + +def test_factories_merge_noun_chunks(doc): + assert len(doc) == 7 + nlp = Language() + merge_noun_chunks = nlp.create_pipe('merge_noun_chunks') + merge_noun_chunks(doc) + assert len(doc) == 6 + assert doc[2].text == 'New York' + + +def test_factories_merge_ents(doc): + assert len(doc) == 7 + assert len(list(doc.ents)) == 1 + nlp = Language() + merge_entities = nlp.create_pipe('merge_entities') + merge_entities(doc) + assert len(doc) == 6 + assert len(list(doc.ents)) == 1 + assert doc[2].text == 'New York'