mirror of https://github.com/explosion/spaCy.git (synced 2025-03-04 11:25:51 +03:00)
add coordination ruler
This commit is contained in:
parent bff8725f4b
commit 2c37811c34
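
This commit adds a new English-only pipeline component, "coordination_splitter", which rewrites a doc containing coordinated phrases into a doc of split phrases. A minimal usage sketch (an editor's illustration based on the tests in this commit; it assumes the en_core_web_sm model is installed, which the component itself imports):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("coordination_splitter")  # factory registered by this commit
    doc = nlp("I read and write books")
    # When a rule fires, __call__ returns Doc(vocab, words=split), so each
    # split phrase becomes a single "token" of the resulting Doc:
    print([t.text for t in doc])  # ['I read books', 'I write books']

Note the design choice: the component does not re-tokenize or re-parse the split phrases; it replaces the whole Doc with one whose words are the phrase strings.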
spacy/pipeline/__init__.py
@@ -1,4 +1,5 @@
 from .attributeruler import AttributeRuler
+from .coordinationruler import CoordinationSplitter
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
@@ -21,6 +22,7 @@ from .trainable_pipe import TrainablePipe
 
 __all__ = [
     "AttributeRuler",
+    "CoordinationSplitter",
     "DependencyParser",
     "EditTreeLemmatizer",
     "EntityLinker",
spacy/pipeline/coordinationruler.py (new file, 321 lines)
@@ -0,0 +1,321 @@
from typing import List, Callable, Optional, Union
from pydantic import BaseModel, validator
import re
import en_core_web_sm

from ..tokens import Doc
from ..language import Language
from ..vocab import Vocab
from .pipe import Pipe

########### DEFAULT COORDINATION SPLITTING RULES ##############

def _split_duplicate_object(doc: Doc) -> Union[List[str], None]:
    """Split a text with 2 verbs and 1 object (and optionally a subject) into
    2 texts each with 1 verb, the shared object (and its modifiers), and the subject if present.

    i.e. 'I use and provide clinical supervision' -->
    ['I use clinical supervision', 'I provide clinical supervision']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts.
    """
    sentences = []

    for token in doc:
        if token.pos_ == "VERB" and (token.dep_ == "ROOT" or token.dep_ == "conj"):

            has_AND = False
            has_second_verb = False
            has_dobj = False
            subject = None

            # Find the subject if it exists
            for possible_subject in token.head.children:
                if possible_subject.dep_ in ["nsubj", "nsubjpass"]:
                    subject = possible_subject
                    break

            for child in token.children:

                if child.pos_ == "CCONJ" and child.lemma_ == "and":
                    has_AND = True

                if child.pos_ == "VERB" and child.dep_ == "conj":
                    has_second_verb = True
                    second_verb = child
                    first_verb = token.head if token.dep_ == "conj" else token

                    for descendant in second_verb.subtree:
                        if descendant.dep_ == "dobj":
                            has_dobj = True
                            # Collect the full noun phrase for the direct object
                            dobj_span = doc[
                                descendant.left_edge.i : descendant.right_edge.i + 1
                            ]
                            dobj = dobj_span.text

            if has_AND and has_second_verb and has_dobj:
                subject_text = subject.text + " " if subject else ""
                first_text = "{}{} {}".format(subject_text, first_verb, dobj)
                second_text = "{}{} {}".format(subject_text, second_verb, dobj)

                sentences.extend([first_text, second_text])

    return sentences if sentences else None


def _split_on_and(text: str) -> List[str]:
    """Split a text on 'and' and return a list of the split texts.

    Args:
        text (str): The text to split.

    Returns:
        List[str]: The split texts.
    """
    text = re.sub(r"\s\s+", " ", text)

    replacements = {
        ";": ",",
        ", and ,": " and ",
        ", and,": " and ",
        ",and ,": " and ",
        ", and ": " and ",
        " and ,": " and ",
        ",and,": " and ",
        " and,": " and ",
        ",and ": " and ",
    }
    for old, new in replacements.items():
        text = text.replace(old, new)

    return [t.strip() for t in re.split(r",| and ", text)]


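# Editor's illustration (not part of the committed file): after the
# punctuation normalisation above, the text is split on commas and " and ":
#
#   _split_on_and("communication, teamwork and leadership")
#   # -> ['communication', 'teamwork', 'leadership']

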
def _split_duplicate_verb(doc: Doc) -> Union[List[str], None]:
    """Split a text with 1 verb and 2 objects.

    i.e. 'I love using smartphones and apps' -->
    ['I love using smartphones', 'I love using apps']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts.
    """

    for token in doc:

        if token.pos_ == "VERB" and token.dep_ == "ROOT":

            has_AND = False
            has_dobj = False
            has_sec_obj = False
            subject = ""

            for child in token.children:

                # Track the subject at this level; in English the subject
                # precedes the object, so it is set before a split is built
                subject = child.text if child.dep_ == "nsubj" else subject

                if child.dep_ == "dobj":
                    has_dobj = True

                    objects = " ".join(
                        [
                            c.text
                            for c in token.subtree
                            if c.text != token.text and c.dep_ != "nsubj"
                        ]
                    )

                    split_objects = _split_on_and(objects)

                    object_list = []
                    for split in split_objects:
                        object_list.append(split)

                    for subchild in child.children:

                        if subchild.pos_ == "CCONJ" and subchild.lemma_ == "and":
                            has_AND = True

                        if subchild.dep_ == "conj":
                            has_sec_obj = True

                    if has_AND and has_dobj and has_sec_obj:
                        text_list = [
                            f"{subject} {token.text} {split}.".strip()
                            for split in object_list
                        ]
                        return [text.replace(" ..", ".") for text in text_list]

    return None


def _split_skill_mentions(doc: Doc) -> Union[List[str], None]:
    """Split a text with 2 skills into 2 texts with 1 skill.

    i.e. 'written and oral communication skills' -->
    ['written communication skills', 'oral communication skills']

    Args:
        doc (Doc): The spaCy Doc object.

    Returns:
        List[str]: The split texts.
    """
    for token in doc:
        if (
            token.pos_ == "NOUN"
            and token.lemma_ == "skill"
            and token.idx == doc[-1].idx
        ):

            has_AND = False

            root = [token for token in doc if token.dep_ == "ROOT"]
            if root:
                root = root[0]

                for child in root.subtree:

                    if child.pos_ == "CCONJ" and child.lemma_ == "and":
                        has_AND = True

                if has_AND:
                    skill_def = " ".join(
                        [c.text for c in root.subtree if c.text != token.text]
                    )

                    split_skills = _split_on_and(skill_def)

                    skill_lists = []
                    for split_skill in split_skills:
                        skill_lists.append("{} {}".format(split_skill, token.text))

                    return skill_lists
    return None


class SplittingRule(BaseModel):
    function: Callable[[Doc], Union[List[str], None]]

    @validator("function")
    def check_return_type(cls, v):
        nlp = en_core_web_sm.load()
        dummy_doc = nlp("This is a dummy sentence.")
        result = v(dummy_doc)
        if result is not None:
            if not isinstance(result, list):
                raise ValueError(
                    "The custom splitting rule must return None or a list."
                )
            elif not all(isinstance(item, str) for item in result):
                raise ValueError(
                    "The custom splitting rule must return None or a list of strings."
                )
        return v


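# Editor's illustration (not part of the committed file): the validator
# calls each rule once on a dummy parsed doc, so an invalid rule fails at
# construction time:
#
#   SplittingRule(function=lambda doc: "not a list")
#   # raises pydantic.ValidationError, wrapping "The custom splitting rule
#   # must return None or a list."

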
@Language.factory(
    "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"]
)
def make_coordination_splitter(nlp: Language, name: str):
    """Make a CoordinationSplitter component.

    The default splitting rules include:

    - _split_duplicate_object: Split a text with 2 verbs and 1 object (and
      optionally a subject) into two texts, each with 1 verb, the shared
      object (and its modifiers), and the subject if present.
    - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two
      texts, each with 1 verb and 1 object.
    - _split_skill_mentions: Split a text with 2 skills into 2 texts with
      1 skill (the phrase must end with 'skills' and the skills must be
      separated by 'and').

    Args:
        nlp (Language): The spaCy Language object.
        name (str): The name of the component.

    RETURNS (CoordinationSplitter): The CoordinationSplitter component.

    DOCS: xxx
    """

    return CoordinationSplitter(nlp.vocab, name=name)


class CoordinationSplitter(Pipe):
    def __init__(
        self,
        vocab: Vocab,
        name: str = "coordination_splitter",
        rules: Optional[List[SplittingRule]] = None,
    ) -> None:
        self.name = name
        self.vocab = vocab
        if rules is None:
            default_rules = [
                _split_duplicate_object,
                _split_duplicate_verb,
                _split_skill_mentions,
            ]
            self.rules = [SplittingRule(function=rule) for rule in default_rules]
        else:
            # Ensure provided rules are wrapped in SplittingRule instances
            self.rules = [
                rule
                if isinstance(rule, SplittingRule)
                else SplittingRule(function=rule)
                for rule in rules
            ]

    def clear_rules(self) -> None:
        """Clear the default splitting rules."""
        self.rules = []

    def add_default_rules(self) -> None:
        """Reset the rules to the default splitting rules."""
        default_rules = [
            _split_duplicate_object,
            _split_duplicate_verb,
            _split_skill_mentions,
        ]
        self.rules = [SplittingRule(function=rule) for rule in default_rules]

    def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
        """Add a single splitting rule to the default rules."""
        validated_rule = SplittingRule(function=rule)
        self.rules.append(validated_rule)

    def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
        """Add a list of splitting rules to the default rules.

        Args:
            rules (List[Callable[[Doc], Union[List[str], None]]]): A list of
                functions to be added as splitting rules.
        """
        for rule in rules:
            # Wrap each rule in a SplittingRule instance to ensure it's validated
            validated_rule = SplittingRule(function=rule)
            self.rules.append(validated_rule)

    def __call__(self, doc: Doc) -> Doc:
        """Apply the splitting rules to the doc.

        Args:
            doc (Doc): The spaCy Doc object.

        Returns:
            Doc: The modified spaCy Doc object.
        """
        if doc.lang_ != "en":
            return doc

        for rule in self.rules:
            split = rule.function(doc)
            if split:
                return Doc(doc.vocab, words=split)
        return doc
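
The rule set is mutable at runtime. A short sketch of the management API above (an editor's illustration; my_rule is a hypothetical rule, and the model assumption is the same en_core_web_sm the module imports):

    import spacy
    from typing import List, Optional

    nlp = spacy.load("en_core_web_sm")
    coord_splitter = nlp.add_pipe("coordination_splitter")

    def my_rule(doc) -> Optional[List[str]]:
        # hypothetical rule: must return None or a list of strings
        return None

    coord_splitter.clear_rules()        # rules == []
    coord_splitter.add_rule(my_rule)    # wrapped and validated via SplittingRule
    coord_splitter.add_default_rules()  # overwrites with the three defaults

Note that add_default_rules replaces the rule list rather than appending, so the custom rule above is dropped.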
spacy/tests/pipeline/test_coordinationruler.py (new file, 66 lines)
@@ -0,0 +1,66 @@
import pytest
from typing import List
from spacy.tokens import Doc

import en_core_web_sm


@pytest.fixture
def nlp():
    return en_core_web_sm.load()


def _my_custom_splitting_rule(doc: Doc) -> List[str]:
    split_phrases = []
    for token in doc:
        if token.text == "read":
            split_phrases.append("test1")
            split_phrases.append("test2")
    return split_phrases


def test_coordinationruler(nlp):
    doc = nlp("I read and write books")
    assert len(doc) == 5
    assert [d.text for d in doc] == ["I", "read", "and", "write", "books"]
    coord_splitter = nlp.add_pipe("coordination_splitter")
    assert len(coord_splitter.rules) == 3
    assert coord_splitter.name == "coordination_splitter"
    doc_split = coord_splitter(doc)
    assert len(doc_split) == 2
    assert [t.text for t in doc_split] == ["I read books", "I write books"]


def test_coordinationruler_clear_rules(nlp):
    coord_splitter = nlp.add_pipe("coordination_splitter")
    assert len(coord_splitter.rules) == 3
    coord_splitter.clear_rules()
    assert len(coord_splitter.rules) == 0
    assert coord_splitter.rules == []


def test_coordinationruler_add_rule(nlp):
    coord_splitter = nlp.add_pipe("coordination_splitter")
    assert len(coord_splitter.rules) == 3
    coord_splitter.add_rule(_my_custom_splitting_rule)
    assert len(coord_splitter.rules) == 4


def test_coordinationruler_add_rules(nlp):
    doc = nlp("I read and write books")
    coord_splitter = nlp.add_pipe("coordination_splitter")
    coord_splitter.clear_rules()
    coord_splitter.add_rules([_my_custom_splitting_rule, _my_custom_splitting_rule])
    assert len(coord_splitter.rules) == 2
    doc_split = coord_splitter(doc)
    assert len(doc_split) == 2

    assert [t.text for t in doc_split] == ["test1", "test2"]


def test_coordinationruler_add_default_rules(nlp):
    coord_splitter = nlp.add_pipe("coordination_splitter")
    coord_splitter.clear_rules()
    assert len(coord_splitter.rules) == 0
    coord_splitter.add_default_rules()
    assert len(coord_splitter.rules) == 3