mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 17:54:39 +03:00
Merge remote-tracking branch 'origin/master' into feature/single-thread
This commit is contained in:
commit
e85dd038fe
|
@ -1310,7 +1310,7 @@ LOOKUP = {
|
||||||
"alphabets": "alphabet",
|
"alphabets": "alphabet",
|
||||||
"alphas": "alpha",
|
"alphas": "alpha",
|
||||||
"alpines": "alpine",
|
"alpines": "alpine",
|
||||||
"also": "conjurer",
|
"also": "also",
|
||||||
"also-rans": "also-ran",
|
"also-rans": "also-ran",
|
||||||
"altars": "altar",
|
"altars": "altar",
|
||||||
"alterations": "alteration",
|
"alterations": "alteration",
|
||||||
|
|
|
@ -17,6 +17,7 @@ from .vocab import Vocab
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
|
||||||
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
|
||||||
|
from .pipeline import merge_noun_chunks, merge_entities
|
||||||
from .compat import json_dumps, izip, basestring_
|
from .compat import json_dumps, izip, basestring_
|
||||||
from .gold import GoldParse
|
from .gold import GoldParse
|
||||||
from .scorer import Scorer
|
from .scorer import Scorer
|
||||||
|
@ -105,7 +106,9 @@ class Language(object):
|
||||||
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
||||||
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
|
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
|
||||||
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
'sbd': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
||||||
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg)
|
'sentencizer': lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
|
||||||
|
'merge_noun_chunks': lambda nlp, **cfg: merge_noun_chunks,
|
||||||
|
'merge_entities': lambda nlp, **cfg: merge_entities
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
|
||||||
|
|
|
@ -69,6 +69,34 @@ class SentenceSegmenter(object):
|
||||||
yield doc[start:len(doc)]
|
yield doc[start:len(doc)]
|
||||||
|
|
||||||
|
|
||||||
|
def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    If the document has not been parsed, it is handed back unchanged.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    if not doc.is_parsed:
        # Always hand the doc back: a bare `return` would yield None to the
        # caller instead of the Doc promised above.
        return doc
    # Collect every span before merging anything, so that the merges below
    # do not run while `doc.noun_chunks` is still being iterated.
    spans = [(chunk.start_char, chunk.end_char, chunk.root.tag, chunk.root.dep)
             for chunk in doc.noun_chunks]
    for start, end, tag, dep in spans:
        # The merged token takes the tag and dependency label of the
        # chunk's root.
        doc.merge(start, end, tag=tag, dep=dep)
    return doc
|
||||||
|
|
||||||
|
|
||||||
|
def merge_entities(doc):
    """Merge each entity span into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged entities.
    """
    # Snapshot the character offsets and root annotations up front, so the
    # merges below never run while `doc.ents` is still being iterated.
    pending = []
    for ent in doc.ents:
        head = ent.root
        pending.append((ent.start_char, ent.end_char, head.tag, head.dep,
                        ent.label))
    for begin, finish, tag_id, dep_id, label in pending:
        doc.merge(begin, finish, tag=tag_id, dep=dep_id, ent_type=label)
    return doc
|
||||||
|
|
||||||
|
|
||||||
class Pipe(object):
|
class Pipe(object):
|
||||||
"""This class is not instantiated directly. Components inherit from it, and
|
"""This class is not instantiated directly. Components inherit from it, and
|
||||||
it defines the interface that components should follow to function as
|
it defines the interface that components should follow to function as
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from __future__ import unicode_literals
|
||||||
from ...vocab import Vocab
|
from ...vocab import Vocab
|
||||||
from ...pipeline import DependencyParser
|
from ...pipeline import DependencyParser
|
||||||
from ...tokens import Doc
|
from ...tokens import Doc
|
||||||
|
|
44
spacy/tests/pipeline/test_factories.py
Normal file
44
spacy/tests/pipeline/test_factories.py
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from ..util import get_doc
|
||||||
|
from ...language import Language
|
||||||
|
from ...tokens import Span
|
||||||
|
from ... import util
|
||||||
|
|
||||||
|
@pytest.fixture
def doc(en_tokenizer):
    """Build an annotated Doc for 'I like New York in Autumn.'.

    The Doc carries hand-written heads, tags, POS and dependency labels,
    has tokens 2-3 ('New York') set as a single GPE entity, and is flagged
    as parsed and tagged.
    """
    sentence = 'I like New York in Autumn.'
    tokenized = en_tokenizer(sentence)
    words = [token.text for token in tokenized]
    # NOTE(review): 'like' carries tag 'IN' / dep 'prep' but pos 'VERB', and
    # 'I' is labelled 'ROOT' -- confirm these annotations are intentional.
    annotations = dict(
        heads=[1, 0, 1, -2, -3, -1, -5],
        tags=['PRP', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'],
        pos=['PRON', 'VERB', 'PROPN', 'PROPN', 'ADP', 'PROPN', 'PUNCT'],
        deps=['ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'],
    )
    doc = get_doc(tokenized.vocab, words, **annotations)
    gpe_label = doc.vocab.strings['GPE']
    doc.ents = [Span(doc, 2, 4, gpe_label)]
    doc.is_parsed = True
    doc.is_tagged = True
    return doc
|
||||||
|
|
||||||
|
|
||||||
|
def test_factories_merge_noun_chunks(doc):
    """The 'merge_noun_chunks' factory collapses 'New York' into one token."""
    assert len(doc) == 7
    component = Language().create_pipe('merge_noun_chunks')
    component(doc)
    # One token fewer: 'New' + 'York' are now a single token at index 2.
    assert len(doc) == 6
    assert doc[2].text == 'New York'
|
||||||
|
|
||||||
|
|
||||||
|
def test_factories_merge_ents(doc):
    """The 'merge_entities' factory merges the entity into a single token."""
    assert len(doc) == 7
    assert len(list(doc.ents)) == 1
    component = Language().create_pipe('merge_entities')
    component(doc)
    # The entity is still present, but now spans one merged token.
    assert len(doc) == 6
    assert len(list(doc.ents)) == 1
    assert doc[2].text == 'New York'
|
Loading…
Reference in New Issue
Block a user