from typing import List, Callable, Optional, Union from pydantic import BaseModel, validator import re import en_core_web_sm from ..tokens import Doc from ..language import Language from ..vocab import Vocab from .pipe import Pipe ########### DEFAULT COORDINATION SPLITTING RULES ############## def _split_duplicate_object(doc: Doc) -> Union[List[str], None]: """Split a text with 2 verbs and 1 object (and optionally a subject) into 2 texts each with 1 verb, the shared object (and its modifiers), and the subject if present. i.e. 'I use and provide clinical supervision' --> ['I use clinical supervision', 'I provide clinical supervision'] Args: doc (Doc): The spaCy Doc object. Returns: List[str]: The split texts. """ sentences = [] for token in doc: if token.pos_ == "VERB" and (token.dep_ == "ROOT" or token.dep_ == "conj"): has_AND = False has_second_verb = False has_dobj = False subject = None # Find the subject if it exists for possible_subject in token.head.children: if possible_subject.dep_ in ["nsubj", "nsubjpass"]: subject = possible_subject break for child in token.children: if child.pos_ == "CCONJ" and child.lemma_ == "and": has_AND = True if child.pos_ == "VERB" and child.dep_ == "conj": has_second_verb = True second_verb = child first_verb = token.head if token.dep_ == "conj" else token for descendant in second_verb.subtree: if descendant.dep_ == "dobj": has_dobj = True # Collect the full noun phrase for the direct object dobj_span = doc[ descendant.left_edge.i : descendant.right_edge.i + 1 ] dobj = dobj_span.text if has_AND and has_second_verb and has_dobj: subject_text = subject.text + " " if subject else "" first_text = "{}{} {}".format(subject_text, first_verb, dobj) second_text = "{}{} {}".format(subject_text, second_verb, dobj) sentences.extend([first_text, second_text]) return sentences if sentences else None def _split_on_and(text: str) -> List[str]: """Split a text on 'and' and return a list of the split texts. Args: text (str): The text to split. Returns: List[str]: The split texts. """ text = re.sub(r"\s\s+", " ", text) replacements = { ";": ",", ", and ,": " and ", ", and,": " and ", ",and ,": " and ", ", and ": " and ", " and ,": " and ", ",and,": " and ", " and,": " and ", ",and ": " and ", } for old, new in replacements.items(): text = text.replace(old, new) return [t.strip() for t in re.split(r",| and ", text)] def _split_duplicate_verb(doc: Doc) -> Union[List[str], None]: """Split a text with 1 verb and 2 objects. i.e. 'I love using smartphones and apps' --> ['I love using smartphones', 'I love using apps'] Args: doc (Doc): The spaCy Doc object. Returns: List[str]: The split texts. """ for token in doc: if token.pos_ == "VERB" and token.dep_ == "ROOT": has_AND = False has_dobj = False has_sec_obj = False subject = "" for child in token.children: if child.dep_ == "dobj": has_dobj = True subject = child.text if child.dep_ == "nsubj" else subject objects = " ".join( [ c.text for c in token.subtree if c.text != token.text and c.dep_ != "nsubj" ] ) split_objects = _split_on_and(objects) object_list = [] for split in split_objects: object_list.append(split) for subchild in child.children: if subchild.pos_ == "CCONJ" and subchild.lemma_ == "and": has_AND = True if subchild.dep_ == "conj": has_sec_obj = True if has_AND and has_dobj and has_sec_obj: text_list = [ f"{subject} {token.text} {split}.".strip() for split in object_list ] return [text.replace(" ..", ".") for text in text_list] return None def _split_skill_mentions(doc: Doc) -> Union[List[str], None]: """Split a text with 2 skills into 2 texts with 1 skill. i.e. 'written and oral communication skills' --> ['written communication skills', 'oral communication skills'] Args: text (str): The text to split. Returns: List[str]: The split texts. """ for token in doc: if ( token.pos_ == "NOUN" and token.lemma_ == "skill" and token.idx == doc[-1].idx ): has_AND = False root = [token for token in doc if token.dep_ == "ROOT"] if root: root = root[0] for child in root.subtree: if child.pos_ == "CCONJ" and child.lemma_ == "and": has_AND = True if has_AND: skill_def = " ".join( [c.text for c in root.subtree if c.text != token.text] ) split_skills = _split_on_and(skill_def) skill_lists = [] for split_skill in split_skills: skill_lists.append("{} {}".format(split_skill, token.text)) return skill_lists return None class SplittingRule(BaseModel): function: Callable[[Doc], Union[List[str], None]] @validator("function") def check_return_type(cls, v): nlp = en_core_web_sm.load() dummy_doc = nlp("This is a dummy sentence.") result = v(dummy_doc) if result is not None: if not isinstance(result, List): raise ValueError( "The custom splitting rule must return None or a list." ) elif not all(isinstance(item, str) for item in result): raise ValueError( "The custom splitting rule must return None or a list of strings." ) return v @Language.factory( "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"] ) def make_coordination_splitter(nlp: Language, name: str): """Make a CoordinationSplitter component. the default splitting rules include: - _split_duplicate_object: Split a text with 2 verbs and 1 object (and optionally a subject) into two texts each with 1 verb, the shared object (and its modifiers), and the subject if present. - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two texts each with 1 verb and 1 object. - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1 skill (the phrase must end with 'skills' and the skills must be separated by 'and') Args: nlp (Language): The spaCy Language object. name (str): The name of the component. RETURNS The CoordinationSplitter component. DOCS: xxx """ return CoordinationSplitter(nlp.vocab, name=name) class CoordinationSplitter(Pipe): def __init__( self, vocab: Vocab, name: str = "coordination_splitter", rules: Optional[List[SplittingRule]] = None, ) -> None: self.name = name self.vocab = vocab if rules is None: default_rules = [ _split_duplicate_object, _split_duplicate_verb, _split_skill_mentions, ] self.rules = [SplittingRule(function=rule) for rule in default_rules] else: # Ensure provided rules are wrapped in SplittingRule instances self.rules = [ rule if isinstance(rule, SplittingRule) else SplittingRule(function=rule) for rule in rules ] def clear_rules(self) -> None: """Clear the default splitting rules.""" self.rules = [] def add_default_rules(self) -> List[SplittingRule]: """Reset the default splitting rules.""" default_rules = [ _split_duplicate_object, _split_duplicate_verb, _split_skill_mentions, ] self.rules = [SplittingRule(function=rule) for rule in default_rules] def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None: """Add a single splitting rule to the default rules.""" validated_rule = SplittingRule(function=rule) self.rules.append(validated_rule) def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None: """Add a list of splitting rules to the default rules. Args: rules (List[Callable[[Doc], Union[List[str], None]]]): A list of functions to be added as splitting rules. """ for rule in rules: # Wrap each rule in a SplittingRule instance to ensure it's validated validated_rule = SplittingRule(function=rule) self.rules.append(validated_rule) def __call__(self, doc: Doc) -> Doc: """Apply the splitting rules to the doc. Args: doc (Doc): The spaCy Doc object. Returns: Doc: The modified spaCy Doc object. """ if doc.lang_ != "en": return doc for rule in self.rules: split = rule.function(doc) if split: return Doc(doc.vocab, words=split) return doc