From 2c37811c34758642d1549c9d68273f59fae36e98 Mon Sep 17 00:00:00 2001
From: India Kerle
Date: Mon, 19 Feb 2024 12:39:40 +0000
Subject: [PATCH] add coordination ruler

---
 spacy/pipeline/__init__.py                    |   2 +
 spacy/pipeline/coordinationruler.py           | 321 ++++++++++++++++++
 .../tests/pipeline/test_coordinationruler.py  |  66 ++++
 3 files changed, 389 insertions(+)
 create mode 100644 spacy/pipeline/coordinationruler.py
 create mode 100644 spacy/tests/pipeline/test_coordinationruler.py

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2c4a5a8a8..02c900310 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,4 +1,5 @@
 from .attributeruler import AttributeRuler
+from .coordinationruler import CoordinationSplitter
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
@@ -21,6 +22,7 @@ from .trainable_pipe import TrainablePipe
 
 __all__ = [
     "AttributeRuler",
+    "CoordinationSplitter",
     "DependencyParser",
     "EditTreeLemmatizer",
     "EntityLinker",
diff --git a/spacy/pipeline/coordinationruler.py b/spacy/pipeline/coordinationruler.py
new file mode 100644
index 000000000..f2b62ac85
--- /dev/null
+++ b/spacy/pipeline/coordinationruler.py
@@ -0,0 +1,321 @@
import re
from typing import Callable, List, Optional, Union

from pydantic import BaseModel, validator

from ..language import Language
from ..tokens import Doc
from ..vocab import Vocab
from .pipe import Pipe

########### DEFAULT COORDINATION SPLITTING RULES ##############


def _split_duplicate_object(doc: Doc) -> Union[List[str], None]:
    """Split a text with 2 verbs and 1 object (and optionally a subject) into
    2 texts, each with 1 verb, the shared object (and its modifiers), and the
    subject if present.

    i.e. 'I use and provide clinical supervision' -->
    ['I use clinical supervision', 'I provide clinical supervision']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts, or None if no split was made.
    """
    sentences = []

    for token in doc:
        if token.pos_ == "VERB" and (token.dep_ == "ROOT" or token.dep_ == "conj"):

            has_AND = False
            has_second_verb = False
            has_dobj = False
            subject = None

            # Find the subject if it exists
            for possible_subject in token.head.children:
                if possible_subject.dep_ in ["nsubj", "nsubjpass"]:
                    subject = possible_subject
                    break

            for child in token.children:

                if child.pos_ == "CCONJ" and child.lemma_ == "and":
                    has_AND = True

                if child.pos_ == "VERB" and child.dep_ == "conj":
                    has_second_verb = True
                    second_verb = child
                    first_verb = token.head if token.dep_ == "conj" else token

                    for descendant in second_verb.subtree:
                        if descendant.dep_ == "dobj":
                            has_dobj = True
                            # Collect the full noun phrase for the direct object
                            dobj_span = doc[
                                descendant.left_edge.i : descendant.right_edge.i + 1
                            ]
                            dobj = dobj_span.text

            if has_AND and has_second_verb and has_dobj:
                subject_text = subject.text + " " if subject else ""
                first_text = "{}{} {}".format(subject_text, first_verb.text, dobj)
                second_text = "{}{} {}".format(subject_text, second_verb.text, dobj)

                sentences.extend([first_text, second_text])

    return sentences if sentences else None


def _split_on_and(text: str) -> List[str]:
    """Split a text on 'and' and on commas, and return a list of the split texts.

    Args:
        text (str): The text to split.

    Returns:
        List[str]: The split texts.
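
    Example (illustrative, tracing the replacements and split below):
        _split_on_and("clinical supervision, and mentoring")
        # -> ['clinical supervision', 'mentoring']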
+ """ + text = re.sub(r"\s\s+", " ", text) + + replacements = { + ";": ",", + ", and ,": " and ", + ", and,": " and ", + ",and ,": " and ", + ", and ": " and ", + " and ,": " and ", + ",and,": " and ", + " and,": " and ", + ",and ": " and ", + } + for old, new in replacements.items(): + text = text.replace(old, new) + + return [t.strip() for t in re.split(r",| and ", text)] + + +def _split_duplicate_verb(doc: Doc) -> Union[List[str], None]: + """Split a text with 1 verb and 2 objects. + + i.e. 'I love using smartphones and apps' --> + ['I love using smartphones', 'I love using apps'] + + Args: + doc (Doc): The spaCy Doc object. + + Returns: + List[str]: The split texts. + """ + + for token in doc: + + if token.pos_ == "VERB" and token.dep_ == "ROOT": + + has_AND = False + has_dobj = False + has_sec_obj = False + subject = "" + + for child in token.children: + + if child.dep_ == "dobj": + has_dobj = True + + subject = child.text if child.dep_ == "nsubj" else subject + + objects = " ".join( + [ + c.text + for c in token.subtree + if c.text != token.text and c.dep_ != "nsubj" + ] + ) + + split_objects = _split_on_and(objects) + + object_list = [] + for split in split_objects: + object_list.append(split) + + for subchild in child.children: + + if subchild.pos_ == "CCONJ" and subchild.lemma_ == "and": + has_AND = True + + if subchild.dep_ == "conj": + has_sec_obj = True + + if has_AND and has_dobj and has_sec_obj: + text_list = [ + f"{subject} {token.text} {split}.".strip() + for split in object_list + ] + return [text.replace(" ..", ".") for text in text_list] + + return None + + +def _split_skill_mentions(doc: Doc) -> Union[List[str], None]: + """Split a text with 2 skills into 2 texts with 1 skill. + + i.e. 'written and oral communication skills' --> + ['written communication skills', 'oral communication skills'] + + Args: + text (str): The text to split. + + Returns: + List[str]: The split texts. + """ + for token in doc: + if ( + token.pos_ == "NOUN" + and token.lemma_ == "skill" + and token.idx == doc[-1].idx + ): + + has_AND = False + + root = [token for token in doc if token.dep_ == "ROOT"] + if root: + root = root[0] + + for child in root.subtree: + + if child.pos_ == "CCONJ" and child.lemma_ == "and": + has_AND = True + + if has_AND: + skill_def = " ".join( + [c.text for c in root.subtree if c.text != token.text] + ) + + split_skills = _split_on_and(skill_def) + + skill_lists = [] + for split_skill in split_skills: + skill_lists.append("{} {}".format(split_skill, token.text)) + + return skill_lists + return None + + +class SplittingRule(BaseModel): + function: Callable[[Doc], Union[List[str], None]] + + @validator("function") + def check_return_type(cls, v): + nlp = en_core_web_sm.load() + dummy_doc = nlp("This is a dummy sentence.") + result = v(dummy_doc) + if result is not None: + if not isinstance(result, List): + raise ValueError( + "The custom splitting rule must return None or a list." + ) + elif not all(isinstance(item, str) for item in result): + raise ValueError( + "The custom splitting rule must return None or a list of strings." + ) + return v + + +@Language.factory( + "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"] +) +def make_coordination_splitter(nlp: Language, name: str): + """Make a CoordinationSplitter component. 

    The default splitting rules include:

    - _split_duplicate_object: Split a text with 2 verbs and 1 object (and
      optionally a subject) into 2 texts, each with 1 verb, the shared object
      (and its modifiers), and the subject if present.
    - _split_duplicate_verb: Split a text with 1 verb and 2 objects into 2
      texts, each with 1 verb and 1 object.
    - _split_skill_mentions: Split a text with 2 skills into 2 texts, each
      with 1 skill (the phrase must end with 'skills' and the skills must be
      separated by 'and').

    Args:
        nlp (Language): The spaCy Language object.
        name (str): The name of the component.

    RETURNS (CoordinationSplitter): The CoordinationSplitter component.

    DOCS: xxx
    """

    return CoordinationSplitter(nlp.vocab, name=name)


class CoordinationSplitter(Pipe):
    def __init__(
        self,
        vocab: Vocab,
        name: str = "coordination_splitter",
        rules: Optional[List[SplittingRule]] = None,
    ) -> None:
        self.name = name
        self.vocab = vocab
        if rules is None:
            default_rules = [
                _split_duplicate_object,
                _split_duplicate_verb,
                _split_skill_mentions,
            ]
            self.rules = [SplittingRule(function=rule) for rule in default_rules]
        else:
            # Ensure provided rules are wrapped in SplittingRule instances
            self.rules = [
                rule
                if isinstance(rule, SplittingRule)
                else SplittingRule(function=rule)
                for rule in rules
            ]

    def clear_rules(self) -> None:
        """Remove all splitting rules."""
        self.rules = []

    def add_default_rules(self) -> None:
        """Reset the rules to the default splitting rules."""
        default_rules = [
            _split_duplicate_object,
            _split_duplicate_verb,
            _split_skill_mentions,
        ]
        self.rules = [SplittingRule(function=rule) for rule in default_rules]

    def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
        """Add a single splitting rule to the current rules."""
        validated_rule = SplittingRule(function=rule)
        self.rules.append(validated_rule)

    def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
        """Add a list of splitting rules to the current rules.

        Args:
            rules (List[Callable[[Doc], Union[List[str], None]]]): A list of
                functions to be added as splitting rules.
        """
        for rule in rules:
            # Wrap each rule in a SplittingRule instance so it is validated
            validated_rule = SplittingRule(function=rule)
            self.rules.append(validated_rule)

    def __call__(self, doc: Doc) -> Doc:
        """Apply the splitting rules to the doc.

        Args:
            doc (Doc): The spaCy Doc object.

        Returns:
            Doc: The modified spaCy Doc object.
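
        Example (with the default rules, as exercised in the tests):
            coord_splitter(nlp("I read and write books"))
            # -> Doc with words ["I read books", "I write books"]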
+ """ + if doc.lang_ != "en": + return doc + + for rule in self.rules: + split = rule.function(doc) + if split: + return Doc(doc.vocab, words=split) + return doc diff --git a/spacy/tests/pipeline/test_coordinationruler.py b/spacy/tests/pipeline/test_coordinationruler.py new file mode 100644 index 000000000..be439e9c5 --- /dev/null +++ b/spacy/tests/pipeline/test_coordinationruler.py @@ -0,0 +1,66 @@ +import pytest +from typing import List +from spacy.tokens import Doc + +import en_core_web_sm + + +@pytest.fixture +def nlp(): + return en_core_web_sm.load() + + +def _my_custom_splitting_rule(doc: Doc) -> List[str]: + split_phrases = [] + for token in doc: + if token.text == "read": + split_phrases.append("test1") + split_phrases.append("test2") + return split_phrases + + +def test_coordinationruler(nlp): + doc = nlp("I read and write books") + assert len(doc) == 5 + assert [d.text for d in doc] == ["I", "read", "and", "write", "books"] + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 3 + assert coord_splitter.name == "coordination_splitter" + doc_split = coord_splitter(doc) + assert len(doc_split) == 2 + assert [t.text for t in doc_split] == ["I read books", "I write books"] + + +def test_coordinationruler_clear_rules(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 3 + coord_splitter.clear_rules() + assert len(coord_splitter.rules) == 0 + assert coord_splitter.rules == [] + + +def test_coordinationruler_add_rule(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 3 + coord_splitter.add_rule(_my_custom_splitting_rule) + assert len(coord_splitter.rules) == 4 + + +def test_coordinationruler_add_rules(nlp): + doc = nlp("I read and write books") + coord_splitter = nlp.add_pipe("coordination_splitter") + coord_splitter.clear_rules() + coord_splitter.add_rules([_my_custom_splitting_rule, _my_custom_splitting_rule]) + assert len(coord_splitter.rules) == 2 + doc_split = coord_splitter(doc) + assert len(doc_split) == 2 + + assert [t.text for t in doc_split] == ["test1", "test2"] + + +def test_coordinationruler_add_default_rules(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + coord_splitter.clear_rules() + assert len(coord_splitter.rules) == 0 + coord_splitter.add_default_rules() + assert len(coord_splitter.rules) == 3