diff --git a/spacy/language.py b/spacy/language.py
index bd1e8d012..f04da7d30 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -17,6 +17,7 @@ from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
 from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
+from .pipeline import merge_noun_chunks, merge_entities
 from .compat import json_dumps, izip, basestring_
 from .gold import GoldParse
 from .scorer import Scorer
@@ -105,7 +106,9 @@ class Language(object):
         'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
         'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
-        'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg)
+        'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
+        'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
+        'merge_entities': lambda nlp, **cfg: merge_entities
     }
 
     def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index cbd58281e..743f6ac85 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -69,6 +69,37 @@ class SentenceSegmenter(object):
             yield doc[start:len(doc)]
 
+
+def merge_noun_chunks(doc):
+    """Merge noun chunks into a single token.
+
+    doc (Doc): The Doc object.
+    RETURNS (Doc): The Doc object with merged noun chunks.
+    """
+    if not doc.is_parsed:
+        return doc
+    # Collect the spans up front: merging mutates the doc, so the lazy
+    # noun_chunks iterator must not be consumed while merging.
+    spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep)
+             for np in doc.noun_chunks]
+    for start, end, tag, dep in spans:
+        doc.merge(start, end, tag=tag, dep=dep)
+    return doc
+
+
+def merge_entities(doc):
+    """Merge entities into a single token.
+
+    doc (Doc): The Doc object.
+    RETURNS (Doc): The Doc object with merged entities.
+    """
+    # Same pattern as above: gather the entity spans before merging.
+    spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label)
+             for e in doc.ents]
+    for start, end, tag, dep, ent_type in spans:
+        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
+    return doc
+
 
 class Pipe(object):
     """This class is not instantiated directly. Components inherit from it, and
     it defines the interface that components should follow to function as
diff --git a/spacy/tests/pipeline/test_factories.py b/spacy/tests/pipeline/test_factories.py
new file mode 100644
index 000000000..35c42ce56
--- /dev/null
+++ b/spacy/tests/pipeline/test_factories.py
@@ -0,0 +1,44 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+
+from ..util import get_doc
+from ...language import Language
+from ...tokens import Span
+from ... import util
+
+@pytest.fixture
+def doc(en_tokenizer):
+    text = 'I like New York in Autumn.'
+    heads = [1, 0, 1, -2, -3, -1, -5]
+    tags = ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.']
+    pos = ['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT']
+    deps = ['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct']
+    tokens = en_tokenizer(text)
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads,
+                  tags=tags, pos=pos, deps=deps)
+    doc.ents = [Span(doc, 2, 4, doc.vocab.strings['GPE'])]
+    doc.is_parsed = True
+    doc.is_tagged = True
+    return doc
+
+
+def test_factories_merge_noun_chunks(doc):
+    assert len(doc) == 7
+    nlp = Language()
+    merge_noun_chunks = nlp.create_pipe('merge_noun_chunks')
+    merge_noun_chunks(doc)
+    assert len(doc) == 6
+    assert doc[2].text == 'New York'
+
+
+def test_factories_merge_ents(doc):
+    assert len(doc) == 7
+    assert len(list(doc.ents)) == 1
+    nlp = Language()
+    merge_entities = nlp.create_pipe('merge_entities')
+    merge_entities(doc)
+    assert len(doc) == 6
+    assert len(list(doc.ents)) == 1
+    assert doc[2].text == 'New York'
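
Usage note (not part of the diff): a minimal sketch of how the new factories are meant to be used, assuming a spaCy v2.x install; the en_core_web_sm model name is illustrative, and any model with a parser and NER works. create_pipe() looks the name up in Language.factories, and the returned function slots into the pipeline like any other component:

    import spacy

    nlp = spacy.load('en_core_web_sm')  # assumed installed; needs parser + NER
    nlp.add_pipe(nlp.create_pipe('merge_entities'), after='ner')
    doc = nlp(u'I like New York in Autumn.')
    print([t.text for t in doc])
    # ['I', 'like', 'New York', 'in', 'Autumn', '.']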