spaCy/spacy/pipeline/functions.py

# coding: utf8
from __future__ import unicode_literals

from ..matcher import Matcher


def merge_noun_chunks(doc):
    """Collapse each noun chunk of the Doc into one token.

    Docs that are not parsed are handed back untouched. Otherwise every
    noun chunk is merged, and the merge is given the tag and dependency
    label of the chunk's root token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The same Doc object, with its noun chunks merged.
    """
    if doc.is_parsed:
        with doc.retokenize() as merge_ctx:
            for chunk in doc.noun_chunks:
                merge_ctx.merge(
                    chunk,
                    attrs={"tag": chunk.root.tag, "dep": chunk.root.dep},
                )
    return doc


def merge_entities(doc):
    """Collapse each entity of the Doc into one token.

    Every span in `doc.ents` is merged. The merge is given the tag and
    dependency label of the entity's root token, plus the entity's label
    as the entity type.

    doc (Doc): The Doc object.
    RETURNS (Doc): The same Doc object, with its entities merged.
    """
    with doc.retokenize() as merge_ctx:
        for entity in doc.ents:
            merge_ctx.merge(
                entity,
                attrs={
                    "tag": entity.root.tag,
                    "dep": entity.root.dep,
                    "ent_type": entity.label,
                },
            )
    return doc


def merge_subtokens(doc, label="subtok"):
    """Merge subtokens into a single token.

    Each run of one or more tokens whose dependency label is `label` is
    matched, and the span from the start of the match to one token past
    its end is merged. A "+" pattern can report several overlapping
    matches for the same run, and overlapping spans cannot be merged in
    one retokenize block, so only the longest non-overlapping candidates
    are kept (ties go to the earliest start).

    doc (Doc): The Doc object.
    label (unicode): The dependency label that marks subtokens.
    RETURNS (Doc): The Doc object with merged subtokens.
    """
    merger = Matcher(doc.vocab)
    merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
    # Each candidate reaches one token past the matched run; presumably that
    # extra token is the one the subtokens attach to (confirm against the
    # parser's labelling scheme). The set also drops exact duplicates.
    candidates = {(start, end + 1) for _, start, end in merger(doc)}
    longest_first = sorted(
        candidates, key=lambda rng: (rng[1] - rng[0], -rng[0]), reverse=True
    )
    claimed = set()  # token indices already covered by a kept range
    kept = []
    for start, end in longest_first:
        covered = range(start, end)
        if claimed.isdisjoint(covered):
            kept.append((start, end))
            claimed.update(covered)
    # Build the spans before retokenizing, in document order.
    spans = [doc[start:end] for start, end in sorted(kept)]
    with doc.retokenize() as retokenizer:
        for span in spans:
            retokenizer.merge(span)
    return doc
💫 Break up large pipeline.pyx (#3246) * Break up large pipeline.pyx * Merge some components back together * Fix typo 2019-02-10 14:14:51 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

			`from ..matcher import Matcher`


			`def merge_noun_chunks(doc):`
			`"""Merge noun chunks into a single token.`

			`doc (Doc): The Doc object.`
			`RETURNS (Doc): The Doc object with merged noun chunks.`
			`"""`
			`if not doc.is_parsed:`
			`return doc`
💫 Replace {Doc,Span}.merge with Doc.retokenize (#3280) * Add deprecation warning to Doc.merge and Span.merge * Replace {Doc,Span}.merge with Doc.retokenize 2019-02-15 12:29:44 +03:00			`with doc.retokenize() as retokenizer:`
			`for np in doc.noun_chunks:`
			`attrs = {"tag": np.root.tag, "dep": np.root.dep}`
			`retokenizer.merge(np, attrs=attrs)`
💫 Break up large pipeline.pyx (#3246) * Break up large pipeline.pyx * Merge some components back together * Fix typo 2019-02-10 14:14:51 +03:00			`return doc`


			`def merge_entities(doc):`
			`"""Merge entities into a single token.`

			`doc (Doc): The Doc object.`
			`RETURNS (Doc): The Doc object with merged noun entities.`
			`"""`
💫 Replace {Doc,Span}.merge with Doc.retokenize (#3280) * Add deprecation warning to Doc.merge and Span.merge * Replace {Doc,Span}.merge with Doc.retokenize 2019-02-15 12:29:44 +03:00			`with doc.retokenize() as retokenizer:`
			`for ent in doc.ents:`
			`attrs = {"tag": ent.root.tag, "dep": ent.root.dep, "ent_type": ent.label}`
			`retokenizer.merge(ent, attrs=attrs)`
💫 Break up large pipeline.pyx (#3246) * Break up large pipeline.pyx * Merge some components back together * Fix typo 2019-02-10 14:14:51 +03:00			`return doc`


			`def merge_subtokens(doc, label="subtok"):`
			`merger = Matcher(doc.vocab)`
			`merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])`
			`matches = merger(doc)`
			`spans = [doc[start : end + 1] for _, start, end in matches]`
💫 Replace {Doc,Span}.merge with Doc.retokenize (#3280) * Add deprecation warning to Doc.merge and Span.merge * Replace {Doc,Span}.merge with Doc.retokenize 2019-02-15 12:29:44 +03:00			`with doc.retokenize() as retokenizer:`
			`for span in spans:`
			`retokenizer.merge(span)`
💫 Break up large pipeline.pyx (#3246) * Break up large pipeline.pyx * Merge some components back together * Fix typo 2019-02-10 14:14:51 +03:00			`return doc`