From 2c37811c34758642d1549c9d68273f59fae36e98 Mon Sep 17 00:00:00 2001
From: India Kerle
Date: Mon, 19 Feb 2024 12:39:40 +0000
Subject: [PATCH] add coordination ruler

---
 spacy/pipeline/__init__.py                    |   2 +
 spacy/pipeline/coordinationruler.py           | 321 ++++++++++++++++++
 .../tests/pipeline/test_coordinationruler.py  |  66 ++++
 3 files changed, 389 insertions(+)
 create mode 100644 spacy/pipeline/coordinationruler.py
 create mode 100644 spacy/tests/pipeline/test_coordinationruler.py

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 2c4a5a8a8..02c900310 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,4 +1,5 @@
 from .attributeruler import AttributeRuler
+from .coordinationruler import CoordinationSplitter
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
@@ -21,6 +22,7 @@ from .trainable_pipe import TrainablePipe
 
 __all__ = [
     "AttributeRuler",
+    "CoordinationSplitter",
     "DependencyParser",
     "EditTreeLemmatizer",
     "EntityLinker",
diff --git a/spacy/pipeline/coordinationruler.py b/spacy/pipeline/coordinationruler.py
new file mode 100644
index 000000000..f2b62ac85
--- /dev/null
+++ b/spacy/pipeline/coordinationruler.py
@@ -0,0 +1,321 @@
import re
from typing import Callable, List, Optional, Union

from pydantic import BaseModel, validator

from ..language import Language
from ..tokens import Doc
from ..vocab import Vocab
from .pipe import Pipe

########### DEFAULT COORDINATION SPLITTING RULES ##############


def _split_duplicate_object(doc: Doc) -> Union[List[str], None]:
    """Split a text with 2 verbs and 1 object (and optionally a subject) into
    2 texts, each with 1 verb, the shared object (and its modifiers), and the
    subject if present.

    i.e. 'I use and provide clinical supervision' -->
    ['I use clinical supervision', 'I provide clinical supervision']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts, or None if no split was made.
    """
    sentences = []

    for token in doc:
        if token.pos_ == "VERB" and (token.dep_ == "ROOT" or token.dep_ == "conj"):

            has_AND = False
            has_second_verb = False
            has_dobj = False
            subject = None

            # Find the subject if it exists
            for possible_subject in token.head.children:
                if possible_subject.dep_ in ["nsubj", "nsubjpass"]:
                    subject = possible_subject
                    break

            for child in token.children:

                if child.pos_ == "CCONJ" and child.lemma_ == "and":
                    has_AND = True

                if child.pos_ == "VERB" and child.dep_ == "conj":
                    has_second_verb = True
                    second_verb = child
                    first_verb = token.head if token.dep_ == "conj" else token

                    for descendant in second_verb.subtree:
                        if descendant.dep_ == "dobj":
                            has_dobj = True
                            # Collect the full noun phrase for the direct object
                            dobj_span = doc[
                                descendant.left_edge.i : descendant.right_edge.i + 1
                            ]
                            dobj = dobj_span.text

            if has_AND and has_second_verb and has_dobj:
                subject_text = subject.text + " " if subject else ""
                first_text = "{}{} {}".format(subject_text, first_verb.text, dobj)
                second_text = "{}{} {}".format(subject_text, second_verb.text, dobj)

                sentences.extend([first_text, second_text])

    return sentences if sentences else None


def _split_on_and(text: str) -> List[str]:
    """Split a text on 'and' and on commas, and return a list of the split texts.

    Args:
        text (str): The text to split.

    Returns:
        List[str]: The split texts.
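
    Example (illustrative, tracing the replacements and split below):
        _split_on_and("clinical supervision, and mentoring")
        # -> ['clinical supervision', 'mentoring']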
+ """ + text = re.sub(r"\s\s+", " ", text) + + replacements = { + ";": ",", + ", and ,": " and ", + ", and,": " and ", + ",and ,": " and ", + ", and ": " and ", + " and ,": " and ", + ",and,": " and ", + " and,": " and ", + ",and ": " and ", + } + for old, new in replacements.items(): + text = text.replace(old, new) + + return [t.strip() for t in re.split(r",| and ", text)] + + +def _split_duplicate_verb(doc: Doc) -> Union[List[str], None]: + """Split a text with 1 verb and 2 objects. + + i.e. 'I love using smartphones and apps' --> + ['I love using smartphones', 'I love using apps'] + + Args: + doc (Doc): The spaCy Doc object. + + Returns: + List[str]: The split texts. + """ + + for token in doc: + + if token.pos_ == "VERB" and token.dep_ == "ROOT": + + has_AND = False + has_dobj = False + has_sec_obj = False + subject = "" + + for child in token.children: + + if child.dep_ == "dobj": + has_dobj = True + + subject = child.text if child.dep_ == "nsubj" else subject + + objects = " ".join( + [ + c.text + for c in token.subtree + if c.text != token.text and c.dep_ != "nsubj" + ] + ) + + split_objects = _split_on_and(objects) + + object_list = [] + for split in split_objects: + object_list.append(split) + + for subchild in child.children: + + if subchild.pos_ == "CCONJ" and subchild.lemma_ == "and": + has_AND = True + + if subchild.dep_ == "conj": + has_sec_obj = True + + if has_AND and has_dobj and has_sec_obj: + text_list = [ + f"{subject} {token.text} {split}.".strip() + for split in object_list + ] + return [text.replace(" ..", ".") for text in text_list] + + return None + + +def _split_skill_mentions(doc: Doc) -> Union[List[str], None]: + """Split a text with 2 skills into 2 texts with 1 skill. + + i.e. 'written and oral communication skills' --> + ['written communication skills', 'oral communication skills'] + + Args: + text (str): The text to split. + + Returns: + List[str]: The split texts. + """ + for token in doc: + if ( + token.pos_ == "NOUN" + and token.lemma_ == "skill" + and token.idx == doc[-1].idx + ): + + has_AND = False + + root = [token for token in doc if token.dep_ == "ROOT"] + if root: + root = root[0] + + for child in root.subtree: + + if child.pos_ == "CCONJ" and child.lemma_ == "and": + has_AND = True + + if has_AND: + skill_def = " ".join( + [c.text for c in root.subtree if c.text != token.text] + ) + + split_skills = _split_on_and(skill_def) + + skill_lists = [] + for split_skill in split_skills: + skill_lists.append("{} {}".format(split_skill, token.text)) + + return skill_lists + return None + + +class SplittingRule(BaseModel): + function: Callable[[Doc], Union[List[str], None]] + + @validator("function") + def check_return_type(cls, v): + nlp = en_core_web_sm.load() + dummy_doc = nlp("This is a dummy sentence.") + result = v(dummy_doc) + if result is not None: + if not isinstance(result, List): + raise ValueError( + "The custom splitting rule must return None or a list." + ) + elif not all(isinstance(item, str) for item in result): + raise ValueError( + "The custom splitting rule must return None or a list of strings." + ) + return v + + +@Language.factory( + "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"] +) +def make_coordination_splitter(nlp: Language, name: str): + """Make a CoordinationSplitter component. 

    The default splitting rules include:

    - _split_duplicate_object: Split a text with 2 verbs and 1 object (and
      optionally a subject) into 2 texts, each with 1 verb, the shared object
      (and its modifiers), and the subject if present.
    - _split_duplicate_verb: Split a text with 1 verb and 2 objects into 2
      texts, each with 1 verb and 1 object.
    - _split_skill_mentions: Split a text with 2 skills into 2 texts, each
      with 1 skill (the phrase must end with 'skills' and the skills must be
      separated by 'and').

    Args:
        nlp (Language): The spaCy Language object.
        name (str): The name of the component.

    RETURNS (CoordinationSplitter): The CoordinationSplitter component.

    DOCS: xxx
    """

    return CoordinationSplitter(nlp.vocab, name=name)


class CoordinationSplitter(Pipe):
    def __init__(
        self,
        vocab: Vocab,
        name: str = "coordination_splitter",
        rules: Optional[List[SplittingRule]] = None,
    ) -> None:
        self.name = name
        self.vocab = vocab
        if rules is None:
            default_rules = [
                _split_duplicate_object,
                _split_duplicate_verb,
                _split_skill_mentions,
            ]
            self.rules = [SplittingRule(function=rule) for rule in default_rules]
        else:
            # Ensure provided rules are wrapped in SplittingRule instances
            self.rules = [
                rule
                if isinstance(rule, SplittingRule)
                else SplittingRule(function=rule)
                for rule in rules
            ]

    def clear_rules(self) -> None:
        """Remove all splitting rules."""
        self.rules = []

    def add_default_rules(self) -> None:
        """Reset the rules to the default splitting rules."""
        default_rules = [
            _split_duplicate_object,
            _split_duplicate_verb,
            _split_skill_mentions,
        ]
        self.rules = [SplittingRule(function=rule) for rule in default_rules]

    def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
        """Add a single splitting rule to the current rules."""
        validated_rule = SplittingRule(function=rule)
        self.rules.append(validated_rule)

    def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
        """Add a list of splitting rules to the current rules.

        Args:
            rules (List[Callable[[Doc], Union[List[str], None]]]): A list of
                functions to be added as splitting rules.
        """
        for rule in rules:
            # Wrap each rule in a SplittingRule instance so it is validated
            validated_rule = SplittingRule(function=rule)
            self.rules.append(validated_rule)

    def __call__(self, doc: Doc) -> Doc:
        """Apply the splitting rules to the doc.

        Args:
            doc (Doc): The spaCy Doc object.

        Returns:
            Doc: The modified spaCy Doc object.
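
        Example (with the default rules, as exercised in the tests):
            coord_splitter(nlp("I read and write books"))
            # -> Doc with words ["I read books", "I write books"]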
+ """ + if doc.lang_ != "en": + return doc + + for rule in self.rules: + split = rule.function(doc) + if split: + return Doc(doc.vocab, words=split) + return doc diff --git a/spacy/tests/pipeline/test_coordinationruler.py b/spacy/tests/pipeline/test_coordinationruler.py new file mode 100644 index 000000000..be439e9c5 --- /dev/null +++ b/spacy/tests/pipeline/test_coordinationruler.py @@ -0,0 +1,66 @@ +import pytest +from typing import List +from spacy.tokens import Doc + +import en_core_web_sm + + +@pytest.fixture +def nlp(): + return en_core_web_sm.load() + + +def _my_custom_splitting_rule(doc: Doc) -> List[str]: + split_phrases = [] + for token in doc: + if token.text == "read": + split_phrases.append("test1") + split_phrases.append("test2") + return split_phrases + + +def test_coordinationruler(nlp): + doc = nlp("I read and write books") + assert len(doc) == 5 + assert [d.text for d in doc] == ["I", "read", "and", "write", "books"] + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 3 + assert coord_splitter.name == "coordination_splitter" + doc_split = coord_splitter(doc) + assert len(doc_split) == 2 + assert [t.text for t in doc_split] == ["I read books", "I write books"] + + +def test_coordinationruler_clear_rules(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 3 + coord_splitter.clear_rules() + assert len(coord_splitter.rules) == 0 + assert coord_splitter.rules == [] + + +def test_coordinationruler_add_rule(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 3 + coord_splitter.add_rule(_my_custom_splitting_rule) + assert len(coord_splitter.rules) == 4 + + +def test_coordinationruler_add_rules(nlp): + doc = nlp("I read and write books") + coord_splitter = nlp.add_pipe("coordination_splitter") + coord_splitter.clear_rules() + coord_splitter.add_rules([_my_custom_splitting_rule, _my_custom_splitting_rule]) + assert len(coord_splitter.rules) == 2 + doc_split = coord_splitter(doc) + assert len(doc_split) == 2 + + assert [t.text for t in doc_split] == ["test1", "test2"] + + +def test_coordinationruler_add_default_rules(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + coord_splitter.clear_rules() + assert len(coord_splitter.rules) == 0 + coord_splitter.add_default_rules() + assert len(coord_splitter.rules) == 3