spaCy/spacy/pipeline/coordinationruler.py

import re
from typing import Callable, List, Optional, Union

from pydantic import BaseModel, validator

from ..language import Language
from ..tokens import Doc
from ..vocab import Vocab
from .pipe import Pipe

########### DEFAULT COORDINATION SPLITTING RULES ##############


def _split_duplicate_object(doc: Doc) -> Union[List[str], None]:
    """Split a text with 2 verbs and 1 object (and optionally a subject) into
    2 texts, each with 1 verb, the shared object (and its modifiers), and the
    subject if present.

    e.g. 'I use and provide clinical supervision' -->
    ['I use clinical supervision', 'I provide clinical supervision']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts, or None if no split applies.
    """
sentences = []
for token in doc:
if token.pos_ == "VERB" and (token.dep_ == "ROOT" or token.dep_ == "conj"):
has_AND = False
has_second_verb = False
has_dobj = False
subject = None
# Find the subject if it exists
for possible_subject in token.head.children:
if possible_subject.dep_ in ["nsubj", "nsubjpass"]:
subject = possible_subject
break
for child in token.children:
if child.pos_ == "CCONJ" and child.lemma_ == "and":
has_AND = True
if child.pos_ == "VERB" and child.dep_ == "conj":
has_second_verb = True
second_verb = child
first_verb = token.head if token.dep_ == "conj" else token
for descendant in second_verb.subtree:
if descendant.dep_ == "dobj":
has_dobj = True
# Collect the full noun phrase for the direct object
dobj_span = doc[
descendant.left_edge.i : descendant.right_edge.i + 1
]
dobj = dobj_span.text
            if has_AND and has_second_verb and has_dobj:
                subject_text = subject.text + " " if subject else ""
                first_text = f"{subject_text}{first_verb.text} {dobj}"
                second_text = f"{subject_text}{second_verb.text} {dobj}"
                sentences.extend([first_text, second_text])
    return sentences if sentences else None


def _split_on_and(text: str) -> List[str]:
    """Split a text on 'and' and on commas, returning the split texts.

    Args:
        text (str): The text to split.

    Returns:
        List[str]: The split texts.
    """
text = re.sub(r"\s\s+", " ", text)
replacements = {
";": ",",
", and ,": " and ",
", and,": " and ",
",and ,": " and ",
", and ": " and ",
" and ,": " and ",
",and,": " and ",
" and,": " and ",
",and ": " and ",
}
for old, new in replacements.items():
text = text.replace(old, new)
return [t.strip() for t in re.split(r",| and ", text)]
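

# Illustration of the helper above (derived directly from the replacement
# table and the split regex; no model is required):
#     _split_on_and("written communication, and oral communication")
#     --> ["written communication", "oral communication"]

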
def _split_duplicate_verb(doc: Doc) -> Union[List[str], None]:
    """Split a text with 1 verb and 2 coordinated objects.

    e.g. 'I love using smartphones and apps' -->
    ['I love using smartphones', 'I love using apps']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts, or None if no split applies.
    """
for token in doc:
if token.pos_ == "VERB" and token.dep_ == "ROOT":
has_AND = False
has_dobj = False
has_sec_obj = False
subject = ""
            for child in token.children:
                # Look for the subject among the verb's direct children here;
                # inside the dobj branch below, child.dep_ is "dobj", so an
                # nsubj check there could never match.
                if child.dep_ == "nsubj":
                    subject = child.text
                if child.dep_ == "dobj":
                    has_dobj = True
                    objects = " ".join(
                        [
                            c.text
                            for c in token.subtree
                            if c.text != token.text and c.dep_ != "nsubj"
                        ]
                    )
                    object_list = _split_on_and(objects)
                    for subchild in child.children:
                        if subchild.pos_ == "CCONJ" and subchild.lemma_ == "and":
                            has_AND = True
                        if subchild.dep_ == "conj":
                            has_sec_obj = True
            if has_AND and has_dobj and has_sec_obj:
                text_list = [
                    f"{subject} {token.text} {split}.".strip()
                    for split in object_list
                ]
                return [text.replace(" ..", ".") for text in text_list]
    return None


def _split_skill_mentions(doc: Doc) -> Union[List[str], None]:
    """Split a text mentioning 2 skills into 2 texts with 1 skill each.

    e.g. 'written and oral communication skills' -->
    ['written communication skills', 'oral communication skills']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts, or None if no split applies.
    """
    for token in doc:
        if (
            token.pos_ == "NOUN"
            and token.lemma_ == "skill"
            and token.i == doc[-1].i  # the skill mention must end the phrase
        ):
            has_AND = False
            root = [t for t in doc if t.dep_ == "ROOT"]
            if root:
                root = root[0]
for child in root.subtree:
if child.pos_ == "CCONJ" and child.lemma_ == "and":
has_AND = True
                if has_AND:
                    skill_def = " ".join(
                        [c.text for c in root.subtree if c.text != token.text]
                    )
                    split_skills = _split_on_and(skill_def)
                    skill_lists = []
                    for split_skill in split_skills:
                        skill_lists.append(f"{split_skill} {token.text}")
                    return skill_lists
    return None


class SplittingRule(BaseModel):
    function: Callable[[Doc], Union[List[str], None]]

    @validator("function")
    def check_return_type(cls, v):
        # Imported lazily so that the module itself does not require the
        # en_core_web_sm model package; constructing a SplittingRule does.
        import en_core_web_sm

        nlp = en_core_web_sm.load()
        dummy_doc = nlp("This is a dummy sentence.")
        result = v(dummy_doc)
        if result is not None:
            if not isinstance(result, list):
                raise ValueError(
                    "The custom splitting rule must return None or a list."
                )
            elif not all(isinstance(item, str) for item in result):
                raise ValueError(
                    "The custom splitting rule must return None or a list of strings."
                )
        return v
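

# A minimal sketch of a custom rule (hypothetical, for illustration): any
# callable taking a Doc and returning a list of strings or None can be
# wrapped in a SplittingRule, which validates it on construction.
#
#     def _no_op_rule(doc: Doc) -> Union[List[str], None]:
#         return None  # never splits
#
#     rule = SplittingRule(function=_no_op_rule)

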
@Language.factory(
    "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"]
)
def make_coordination_splitter(nlp: Language, name: str):
    """Make a CoordinationSplitter component.

    The default splitting rules include:
    - _split_duplicate_object: Split a text with 2 verbs and 1 object (and
      optionally a subject) into two texts, each with 1 verb, the shared
      object (and its modifiers), and the subject if present.
    - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two
      texts, each with 1 verb and 1 object.
    - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1
      skill (the phrase must end with 'skills' and the skills must be
      separated by 'and').

    Args:
        nlp (Language): The spaCy Language object.
        name (str): The name of the component.

    RETURNS (CoordinationSplitter): The CoordinationSplitter component.

    DOCS: xxx
    """
    return CoordinationSplitter(nlp.vocab, name=name)
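

# Example usage (a sketch; assumes an installed English pipeline with a
# dependency parser, e.g. en_core_web_sm):
#
#     import spacy
#
#     nlp = spacy.load("en_core_web_sm")
#     nlp.add_pipe("coordination_splitter")
#     doc = nlp("I use and provide clinical supervision")
#     # Each split phrase becomes one entry in the new Doc's words, e.g.
#     # ['I use clinical supervision', 'I provide clinical supervision']

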
class CoordinationSplitter(Pipe):
    def __init__(
        self,
        vocab: Vocab,
        name: str = "coordination_splitter",
        rules: Optional[
            List[Union[SplittingRule, Callable[[Doc], Union[List[str], None]]]]
        ] = None,
    ) -> None:
        self.name = name
        self.vocab = vocab
        if rules is None:
            default_rules = [
                _split_duplicate_object,
                _split_duplicate_verb,
                _split_skill_mentions,
            ]
            self.rules = [SplittingRule(function=rule) for rule in default_rules]
        else:
            # Ensure provided rules are wrapped in SplittingRule instances.
            self.rules = [
                rule
                if isinstance(rule, SplittingRule)
                else SplittingRule(function=rule)
                for rule in rules
            ]

    def clear_rules(self) -> None:
        """Remove all splitting rules."""
        self.rules = []

    def add_default_rules(self) -> None:
        """Restore the default splitting rules."""
        default_rules = [
            _split_duplicate_object,
            _split_duplicate_verb,
            _split_skill_mentions,
        ]
        self.rules = [SplittingRule(function=rule) for rule in default_rules]

    def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
        """Add a single splitting rule to the current rules."""
        validated_rule = SplittingRule(function=rule)
        self.rules.append(validated_rule)

    def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
        """Add a list of splitting rules to the current rules.

        Args:
            rules (List[Callable[[Doc], Union[List[str], None]]]): A list of
                functions to be added as splitting rules.
        """
        for rule in rules:
            # Wrap each rule in a SplittingRule so it is validated on creation.
            validated_rule = SplittingRule(function=rule)
            self.rules.append(validated_rule)

    def __call__(self, doc: Doc) -> Doc:
        """Apply the splitting rules to the doc.

        Args:
            doc (Doc): The spaCy Doc object.

        Returns:
            Doc: A new Doc built from the split phrases produced by the first
                matching rule, or the original doc if no rule applies.
        """
        if doc.lang_ != "en":
            return doc
        for rule in self.rules:
            split = rule.function(doc)
            if split:
                # Each split phrase becomes a single entry in `words`; the
                # first rule that produces a split wins.
                return Doc(doc.vocab, words=split)
        return doc
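

# Rule management sketch (`my_rule` is a hypothetical callable with the
# Callable[[Doc], Union[List[str], None]] signature):
#
#     splitter = nlp.get_pipe("coordination_splitter")
#     splitter.clear_rules()      # drop the defaults
#     splitter.add_rule(my_rule)  # validated via SplittingRule on addition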