Add built-in factories for merge_entities and merge_noun_chunks

Allows adding those components to the pipeline out-of-the-box if they're defined in a model's meta.json. Also allows usage as nlp.add_pipe(nlp.create_pipe('merge_entities')).
This commit is contained in:
ines 2018-03-15 00:18:51 +01:00
parent 9ad5df41fe
commit d854f69fe3
3 changed files with 76 additions and 1 deletions

View File

@ -17,6 +17,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities
from .compat import json_dumps, izip, basestring_
from .gold import GoldParse
from .scorer import Scorer
@ -105,7 +106,9 @@ class Language(object):
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg)
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
'merge_entities': lambda nlp, **cfg: merge_entities
}
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):

View File

@ -69,6 +69,34 @@ class SentenceSegmenter(object):
yield doc[start:len(doc)]
def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks. If the doc is not
        parsed, it is returned unchanged.
    """
    if not doc.is_parsed:
        # Always hand the doc back: a bare `return` would give callers None
        # instead of the Doc this function is documented to return.
        return doc
    # Read every chunk's offsets and root attributes before the first merge,
    # so doc.noun_chunks is fully consumed before the doc is modified.
    spans = [(chunk.start_char, chunk.end_char, chunk.root.tag, chunk.root.dep)
             for chunk in doc.noun_chunks]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc
def merge_entities(doc):
    """Merge each entity in the Doc into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged entities.
    """
    # Gather the data for every entity before calling doc.merge at all
    # (presumably because merging changes the doc the spans point into).
    pending = []
    for ent in doc.ents:
        pending.append((ent.start_char, ent.end_char, ent.root.tag,
                        ent.root.dep, ent.label))
    for start_char, end_char, root_tag, root_dep, label in pending:
        doc.merge(start_char, end_char, tag=root_tag, dep=root_dep,
                  ent_type=label)
    return doc
class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as

View File

@ -0,0 +1,44 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ..util import get_doc
from ...language import Language
from ...tokens import Span
from ... import util
@pytest.fixture
def doc(en_tokenizer):
    """Provide an annotated Doc for 'I like New York in Autumn.'.

    The Doc carries heads, tags, POS and dependency labels, a single
    Span(doc, 2, 4) entity labelled 'GPE', and is flagged as both parsed
    and tagged.
    """
    tokenized = en_tokenizer('I like New York in Autumn.')
    words = [token.text for token in tokenized]
    # Each annotation list has one entry per token (seven in total).
    # NOTE(review): the tags/deps of the first two tokens look unusual
    # ('like' is tagged 'IN' but has pos 'VERB'); kept as-is - confirm
    # whether this is intentional before changing the data.
    annotated = get_doc(
        tokenized.vocab, words,
        heads=[1, 0, 1, -2, -3, -1, -5],
        tags=['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'],
        pos=['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT'],
        deps=['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'])
    gpe_label = annotated.vocab.strings['GPE']
    annotated.ents = [Span(annotated, 2, 4, gpe_label)]
    annotated.is_parsed = True
    annotated.is_tagged = True
    return annotated
def test_factories_merge_noun_chunks(doc):
    """The 'merge_noun_chunks' factory yields a component that merges chunks.

    The seven-token fixture Doc is expected to shrink to six tokens, with
    'New York' as the token at index 2.
    """
    assert len(doc) == 7
    component = Language().create_pipe('merge_noun_chunks')
    # The component is applied for its in-place effect on the fixture Doc.
    component(doc)
    assert len(doc) == 6
    assert doc[2].text == 'New York'
def test_factories_merge_ents(doc):
    """The 'merge_entities' factory yields a component that merges entities.

    The fixture Doc has seven tokens and one entity; afterwards it is
    expected to have six tokens, still one entity, and 'New York' as the
    token at index 2.
    """
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    component = Language().create_pipe('merge_entities')
    # The component is applied for its in-place effect on the fixture Doc.
    component(doc)
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    assert doc[2].text == 'New York'