Add built-in factories for merge_entities and merge_noun_chunks

Allows adding those components to the pipeline out-of-the-box if they're defined in a model's meta.json. Also allows usage as nlp.add_pipe(nlp.create_pipe('merge_entities')).
This commit is contained in:
ines 2018-03-15 00:18:51 +01:00
parent 9ad5df41fe
commit d854f69fe3
3 changed files with 76 additions and 1 deletions

View File

@ -17,6 +17,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities
from .compat import json_dumps, izip, basestring_ from .compat import json_dumps, izip, basestring_
from .gold import GoldParse from .gold import GoldParse
from .scorer import Scorer from .scorer import Scorer
@ -105,7 +106,9 @@ class Language(object):
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), 'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg) 'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
'merge_entities': lambda nlp, **cfg: merge_entities
} }
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):

View File

@ -69,6 +69,34 @@ class SentenceSegmenter(object):
yield doc[start:len(doc)] yield doc[start:len(doc)]
def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks. If the doc is not
        parsed, it is returned unchanged.
    """
    if not doc.is_parsed:
        # Return the doc itself rather than None, so the documented return
        # type holds even when there is nothing to merge.
        return doc
    # Collect every span first, then merge: the merges are applied by
    # character offset after the noun chunks have been fully iterated.
    spans = [(chunk.start_char, chunk.end_char, chunk.root.tag, chunk.root.dep)
             for chunk in doc.noun_chunks]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc
def merge_entities(doc):
    """Merge each entity span into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged entities.
    """
    # Gather the character offsets and attributes of every entity before
    # the first merge call is made.
    pending = []
    for ent in doc.ents:
        pending.append((ent.start_char, ent.end_char, ent.root.tag,
                        ent.root.dep, ent.label))
    for first_char, last_char, tag_id, dep_id, label_id in pending:
        # The merged token takes the tag and dep of the entity's root token
        # and the entity's label as its entity type.
        doc.merge(first_char, last_char, tag=tag_id, dep=dep_id,
                  ent_type=label_id)
    return doc
class Pipe(object): class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and """This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as it defines the interface that components should follow to function as

View File

@ -0,0 +1,44 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ..util import get_doc
from ...language import Language
from ...tokens import Span
from ... import util
@pytest.fixture
def doc(en_tokenizer):
    """Build an annotated Doc for the sentence 'I like New York in Autumn.'.

    The Doc is created from hand-written heads, tags, POS values and
    dependency labels, gets one 'GPE' entity span over tokens 2-4, and is
    flagged as parsed and tagged.
    """
    text = 'I like New York in Autumn.'
    annotations = {
        'heads': [1, 0, 1, -2, -3, -1, -5],
        'tags': ['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'],
        'pos': ['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT'],
        'deps': ['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'],
    }
    tokenized = en_tokenizer(text)
    words = [token.text for token in tokenized]
    annotated = get_doc(tokenized.vocab, words, **annotations)
    # Single entity covering the tokens "New" and "York".
    gpe_label = annotated.vocab.strings['GPE']
    annotated.ents = [Span(annotated, 2, 4, gpe_label)]
    annotated.is_parsed = True
    annotated.is_tagged = True
    return annotated
def test_factories_merge_noun_chunks(doc):
    """The 'merge_noun_chunks' factory yields a component that merges chunks."""
    assert len(doc) == 7
    # Create the component by name through the built-in factories.
    component = Language().create_pipe('merge_noun_chunks')
    component(doc)
    # One token fewer after merging, with "New York" now a single token.
    assert len(doc) == 6
    merged_token = doc[2]
    assert merged_token.text == 'New York'
def test_factories_merge_ents(doc):
    """The 'merge_entities' factory yields a component that merges entities."""
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    # Create the component by name through the built-in factories.
    component = Language().create_pipe('merge_entities')
    component(doc)
    # One token fewer after merging; the entity itself is still present.
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    merged_token = doc[2]
    assert merged_token.text == 'New York'