spaCy/spacy/pipeline/functions.py

# coding: utf8
from __future__ import unicode_literals

from ..matcher import Matcher


# TODO: replace doc.merge with doc.retokenize


def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    Each merged token is assigned the tag and dependency label of the
    chunk's root token. A Doc that has not been parsed is returned unchanged.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    if not doc.is_parsed:
        return doc
    # Use the retokenizer instead of repeated doc.merge() calls: merges are
    # only registered inside the block and applied together on exit, so the
    # doc is not modified while doc.noun_chunks is being iterated.
    with doc.retokenize() as retokenizer:
        for chunk in doc.noun_chunks:
            attrs = {"tag": chunk.root.tag, "dep": chunk.root.dep}
            retokenizer.merge(chunk, attrs=attrs)
    return doc


def merge_entities(doc):
    """Merge entities into a single token.

    Each merged token is assigned the tag and dependency label of the
    entity's root token, and the entity's label as its entity type.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged entities.
    """
    # Use the retokenizer instead of repeated doc.merge() calls: merges are
    # only registered inside the block and applied together on exit, so the
    # doc is not modified while doc.ents is being iterated.
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            attrs = {
                "tag": ent.root.tag,
                "dep": ent.root.dep,
                "ent_type": ent.label,
            }
            retokenizer.merge(ent, attrs=attrs)
    return doc


def merge_subtokens(doc, label="subtok"):
    """Merge subtokens into a single token.

    doc (Doc): The Doc object.
    label (unicode): The dependency label that marks subtokens.
    RETURNS (Doc): The Doc object with merged subtokens.
    """
    matcher = Matcher(doc.vocab)
    # One or more consecutive tokens whose dependency label equals `label`.
    matcher.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
    # Resolve every match to character offsets before merging anything, so
    # the merges below do not rely on token indices of the unmodified doc.
    boundaries = []
    for _, start, end in matcher(doc):
        # The slice runs one token past the match end (presumably to take in
        # the token the subtokens attach to -- confirm against the parser).
        span = doc[start : end + 1]
        boundaries.append((span.start_char, span.end_char))
    for start_char, end_char in boundaries:
        doc.merge(start_char, end_char)
    return doc