Merge remote-tracking branch 'origin/master' into feature/single-thread

This commit is contained in:
Matthew Honnibal 2018-03-16 02:41:11 +01:00
commit e85dd038fe
5 changed files with 81 additions and 5 deletions

View File

@ -1310,7 +1310,7 @@ LOOKUP = {
"alphabets": "alphabet", "alphabets": "alphabet",
"alphas": "alpha", "alphas": "alpha",
"alpines": "alpine", "alpines": "alpine",
"also": "conjurer", "also": "also",
"also-rans": "also-ran", "also-rans": "also-ran",
"altars": "altar", "altars": "altar",
"alterations": "alteration", "alterations": "alteration",

View File

@ -17,6 +17,7 @@ from .vocab import Vocab
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
from .pipeline import merge_noun_chunks, merge_entities
from .compat import json_dumps, izip, basestring_ from .compat import json_dumps, izip, basestring_
from .gold import GoldParse from .gold import GoldParse
from .scorer import Scorer from .scorer import Scorer
@ -105,7 +106,9 @@ class Language(object):
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg), 'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg) 'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
'merge_entities': lambda nlp, **cfg: merge_entities
} }
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):

View File

@ -69,6 +69,34 @@ class SentenceSegmenter(object):
yield doc[start:len(doc)] yield doc[start:len(doc)]
def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    If the doc has not been parsed, it is handed back unchanged so that the
    caller always receives a Doc (never None).

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    if not doc.is_parsed:
        # Always hand the doc back: callers use the return value as the doc.
        return doc
    # Collect the character offsets and root attributes up front, before any
    # merge call modifies the doc.
    spans = [(chunk.start_char, chunk.end_char, chunk.root.tag, chunk.root.dep)
             for chunk in doc.noun_chunks]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc
def merge_entities(doc):
    """Collapse every entity span of the doc into one token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The same Doc object, with each entity merged.
    """
    # Snapshot the offsets and attributes of all entities first; the merge
    # calls below operate on the doc itself.
    pending = []
    for ent in doc.ents:
        begin, stop = ent.start_char, ent.end_char
        head = ent.root
        pending.append((begin, stop, head.tag, head.dep, ent.label))
    for begin, stop, tag, dep, label in pending:
        doc.merge(begin, stop, tag=tag, dep=dep, ent_type=label)
    return doc
class Pipe(object): class Pipe(object):
"""This class is not instantiated directly. Components inherit from it, and """This class is not instantiated directly. Components inherit from it, and
it defines the interface that components should follow to function as it defines the interface that components should follow to function as

View File

@ -1,3 +1,4 @@
from __future__ import unicode_literals
from ...vocab import Vocab from ...vocab import Vocab
from ...pipeline import DependencyParser from ...pipeline import DependencyParser
from ...tokens import Doc from ...tokens import Doc

View File

@ -0,0 +1,44 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ..util import get_doc
from ...language import Language
from ...tokens import Span
from ... import util
@pytest.fixture
def doc(en_tokenizer):
    """Build an annotated seven-token Doc with a single 'GPE' entity span."""
    text = 'I like New York in Autumn.'
    tokens = en_tokenizer(text)
    words = [t.text for t in tokens]
    # One annotation per token, in token order.
    annotations = dict(
        heads=[1, 0, 1, -2, -3, -1, -5],
        tags=['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'],
        pos=['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT'],
        deps=['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'],
    )
    doc = get_doc(tokens.vocab, words, **annotations)
    # Tokens 2-3 ("New York") form the only entity.
    gpe_label = doc.vocab.strings['GPE']
    doc.ents = [Span(doc, 2, 4, gpe_label)]
    doc.is_parsed = True
    doc.is_tagged = True
    return doc
def test_factories_merge_noun_chunks(doc):
    """The 'merge_noun_chunks' factory yields a callable that merges 'New York'."""
    assert len(doc) == 7
    component = Language().create_pipe('merge_noun_chunks')
    component(doc)
    # One token fewer: the two-token chunk became a single token.
    assert len(doc) == 6
    merged_token = doc[2]
    assert merged_token.text == 'New York'
def test_factories_merge_ents(doc):
    """The 'merge_entities' factory yields a callable that merges the entity."""
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    component = Language().create_pipe('merge_entities')
    component(doc)
    # The entity is still present, but now spans a single token.
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    merged_token = doc[2]
    assert merged_token.text == 'New York'