add usecase

India Kerle 2024-02-29 14:45:07 -03:00
parent d66a616f31
commit 81c52c8ff2
3 changed files with 314 additions and 355 deletions

View File: spacy/pipeline/__init__.py

@@ -1,5 +1,5 @@
 from .attributeruler import AttributeRuler
-from .coordinationruler import CoordinationSplitter
+#from .coordinationruler import CoordinationSplitter
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
@@ -22,7 +22,7 @@ from .trainable_pipe import TrainablePipe
 __all__ = [
     "AttributeRuler",
-    "CoordinationSplitter",
+    #"CoordinationSplitter",
     "DependencyParser",
     "EditTreeLemmatizer",
    "EntityLinker",

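Note: with both the import and the __all__ entry commented out, CoordinationSplitter is no longer exposed from spacy.pipeline, and nothing registers it as a factory (the registration is also commented out in the next file). A minimal sketch of the import a caller would use instead, assuming the module path implied by the test file below:

    from spacy.pipeline.coordinationruler import split_noun_coordination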
View File: spacy/pipeline/coordinationruler.py

@@ -1,7 +1,6 @@
 from typing import List, Callable, Optional, Union
 from pydantic import BaseModel, validator
 import re
-import en_core_web_sm

 from ..tokens import Doc
 from ..language import Language
@@ -10,312 +9,179 @@ from .pipe import Pipe
 ########### DEFAULT COORDINATION SPLITTING RULES ##############


-def _split_duplicate_object(doc: Doc) -> Union[List[str], None]:
-    """Split a text with 2 verbs and 1 object (and optionally a subject) into
-    2 texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
-
-    i.e. 'I use and provide clinical supervision' -->
-    ['I use clinical supervision', 'I provide clinical supervision']
-
-    Args:
-        doc (Doc): The spaCy Doc object.
-    Returns:
-        List[str]: The split texts.
-    """
-    sentences = []
-
-    for token in doc:
-        if token.pos_ == "VERB" and (token.dep_ == "ROOT" or token.dep_ == "conj"):
-
-            has_AND = False
-            has_second_verb = False
-            has_dobj = False
-            subject = None
-
-            # Find the subject if it exists
-            for possible_subject in token.head.children:
-                if possible_subject.dep_ in ["nsubj", "nsubjpass"]:
-                    subject = possible_subject
-                    break
-
-            for child in token.children:
-
-                if child.pos_ == "CCONJ" and child.lemma_ == "and":
-                    has_AND = True
-
-                if child.pos_ == "VERB" and child.dep_ == "conj":
-                    has_second_verb = True
-                    second_verb = child
-                    first_verb = token.head if token.dep_ == "conj" else token
-
-                    for descendant in second_verb.subtree:
-                        if descendant.dep_ == "dobj":
-                            has_dobj = True
-                            # Collect the full noun phrase for the direct object
-                            dobj_span = doc[
-                                descendant.left_edge.i : descendant.right_edge.i + 1
-                            ]
-                            dobj = dobj_span.text
-
-            if has_AND and has_second_verb and has_dobj:
-                subject_text = subject.text + " " if subject else ""
-                first_text = "{}{} {}".format(subject_text, first_verb, dobj)
-                second_text = "{}{} {}".format(subject_text, second_verb, dobj)
-
-                sentences.extend([first_text, second_text])
-
-    return sentences if sentences else None
-
-
-def _split_on_and(text: str) -> List[str]:
-    """Split a text on 'and' and return a list of the split texts.
-
-    Args:
-        text (str): The text to split.
-
-    Returns:
-        List[str]: The split texts.
-    """
-    text = re.sub(r"\s\s+", " ", text)
-
-    replacements = {
-        ";": ",",
-        ", and ,": " and ",
-        ", and,": " and ",
-        ",and ,": " and ",
-        ", and ": " and ",
-        " and ,": " and ",
-        ",and,": " and ",
-        " and,": " and ",
-        ",and ": " and ",
-    }
-    for old, new in replacements.items():
-        text = text.replace(old, new)
-
-    return [t.strip() for t in re.split(r",| and ", text)]
-
-
-def _split_duplicate_verb(doc: Doc) -> Union[List[str], None]:
-    """Split a text with 1 verb and 2 objects.
-
-    i.e. 'I love using smartphones and apps' -->
-    ['I love using smartphones', 'I love using apps']
-
-    Args:
-        doc (Doc): The spaCy Doc object.
-
-    Returns:
-        List[str]: The split texts.
-    """
-    for token in doc:
-
-        if token.pos_ == "VERB" and token.dep_ == "ROOT":
-
-            has_AND = False
-            has_dobj = False
-            has_sec_obj = False
-            subject = ""
-
-            for child in token.children:
-
-                if child.dep_ == "dobj":
-                    has_dobj = True
-
-                    subject = child.text if child.dep_ == "nsubj" else subject
-
-                    objects = " ".join(
-                        [
-                            c.text
-                            for c in token.subtree
-                            if c.text != token.text and c.dep_ != "nsubj"
-                        ]
-                    )
-
-                    split_objects = _split_on_and(objects)
-
-                    object_list = []
-                    for split in split_objects:
-                        object_list.append(split)
-
-                    for subchild in child.children:
-
-                        if subchild.pos_ == "CCONJ" and subchild.lemma_ == "and":
-                            has_AND = True
-
-                        if subchild.dep_ == "conj":
-                            has_sec_obj = True
-
-                        if has_AND and has_dobj and has_sec_obj:
-                            text_list = [
-                                f"{subject} {token.text} {split}.".strip()
-                                for split in object_list
-                            ]
-                            return [text.replace(" ..", ".") for text in text_list]
-
-    return None
-
-
-def _split_skill_mentions(doc: Doc) -> Union[List[str], None]:
-    """Split a text with 2 skills into 2 texts with 1 skill.
-
-    i.e. 'written and oral communication skills' -->
-    ['written communication skills', 'oral communication skills']
-
-    Args:
-        text (str): The text to split.
-    Returns:
-        List[str]: The split texts.
-    """
-    for token in doc:
-        if (
-            token.pos_ == "NOUN"
-            and token.lemma_ == "skill"
-            and token.idx == doc[-1].idx
-        ):
-
-            has_AND = False
-
-            root = [token for token in doc if token.dep_ == "ROOT"]
-            if root:
-                root = root[0]
-
-                for child in root.subtree:
-                    if child.pos_ == "CCONJ" and child.lemma_ == "and":
-                        has_AND = True
-
-                if has_AND:
-                    skill_def = " ".join(
-                        [c.text for c in root.subtree if c.text != token.text]
-                    )
-
-                    split_skills = _split_on_and(skill_def)
-
-                    skill_lists = []
-                    for split_skill in split_skills:
-                        skill_lists.append("{} {}".format(split_skill, token.text))
-
-                    return skill_lists
-    return None
-
-
-class SplittingRule(BaseModel):
-    function: Callable[[Doc], Union[List[str], None]]
-
-    @validator("function")
-    def check_return_type(cls, v):
-        nlp = en_core_web_sm.load()
-        dummy_doc = nlp("This is a dummy sentence.")
-        result = v(dummy_doc)
-        if result is not None:
-            if not isinstance(result, List):
-                raise ValueError(
-                    "The custom splitting rule must return None or a list."
-                )
-            elif not all(isinstance(item, str) for item in result):
-                raise ValueError(
-                    "The custom splitting rule must return None or a list of strings."
-                )
-        return v
-
-
-@Language.factory(
-    "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"]
-)
-def make_coordination_splitter(nlp: Language, name: str):
-    """Make a CoordinationSplitter component.
-
-    the default splitting rules include:
-    - _split_duplicate_object: Split a text with 2 verbs and 1 object (and optionally a subject) into two texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
-    - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two texts each with 1 verb and 1 object.
-    - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1 skill (the phrase must end with 'skills' and the skills must be separated by 'and')
-
-    Args:
-        nlp (Language): The spaCy Language object.
-        name (str): The name of the component.
-
-    RETURNS The CoordinationSplitter component.
-
-    DOCS: xxx
-    """
-    return CoordinationSplitter(nlp.vocab, name=name)
-
-
-class CoordinationSplitter(Pipe):
-    def __init__(
-        self,
-        vocab: Vocab,
-        name: str = "coordination_splitter",
-        rules: Optional[List[SplittingRule]] = None,
-    ) -> None:
-        self.name = name
-        self.vocab = vocab
-        if rules is None:
-            default_rules = [
-                _split_duplicate_object,
-                _split_duplicate_verb,
-                _split_skill_mentions,
-            ]
-            self.rules = [SplittingRule(function=rule) for rule in default_rules]
-        else:
-            # Ensure provided rules are wrapped in SplittingRule instances
-            self.rules = [
-                rule
-                if isinstance(rule, SplittingRule)
-                else SplittingRule(function=rule)
-                for rule in rules
-            ]
-
-    def clear_rules(self) -> None:
-        """Clear the default splitting rules."""
-        self.rules = []
-
-    def add_default_rules(self) -> List[SplittingRule]:
-        """Reset the default splitting rules."""
-        default_rules = [
-            _split_duplicate_object,
-            _split_duplicate_verb,
-            _split_skill_mentions,
-        ]
-        self.rules = [SplittingRule(function=rule) for rule in default_rules]
-
-    def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
-        """Add a single splitting rule to the default rules."""
-        validated_rule = SplittingRule(function=rule)
-        self.rules.append(validated_rule)
-
-    def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
-        """Add a list of splitting rules to the default rules.
-
-        Args:
-            rules (List[Callable[[Doc], Union[List[str], None]]]): A list of functions to be added as splitting rules.
-        """
-        for rule in rules:
-            # Wrap each rule in a SplittingRule instance to ensure it's validated
-            validated_rule = SplittingRule(function=rule)
-            self.rules.append(validated_rule)
-
-    def __call__(self, doc: Doc) -> Doc:
-        """Apply the splitting rules to the doc.
-
-        Args:
-            doc (Doc): The spaCy Doc object.
-
-        Returns:
-            Doc: The modified spaCy Doc object.
-        """
-        if doc.lang_ != "en":
-            return doc
-
-        for rule in self.rules:
-            split = rule.function(doc)
-            if split:
-                return Doc(doc.vocab, words=split)
-        return doc
+def split_noun_coordination(doc: Doc) -> Union[List[str], None]:
+    """Identifies and splits phrases with multiple nouns, a modifier
+    and a conjunction.
+
+    Examples:
+    - "apples and oranges" -> None
+    - "green apples and oranges" -> ["green apples", "green oranges"]
+    - "green apples and rotten oranges" -> None
+    - "apples and juicy oranges" -> ["juicy apples", "juicy oranges"]
+    - "hot chicken wings and soup" -> ["hot chicken wings", "hot soup"]
+    - "spicy ice cream and chicken wings" -> ["spicy ice cream", "spicy chicken wings"]
+
+    Args:
+        doc (Doc): The input document.
+
+    Returns:
+        Union[List[str], None]: A list of the coordinated noun phrases,
+                                or None if no coordinated noun phrases are found.
+    """
+
+    def _split_doc(doc: Doc) -> bool:
+        noun_modified = False
+        has_conjunction = False
+
+        for token in doc:
+            if token.head.pos_ == 'NOUN':  # check to see that the phrase is a noun phrase
+                # check to see if the noun has a modifier
+                has_modifier = any(child.dep_ == 'amod' for child in token.head.children)
+                if has_modifier:
+                    noun_modified = True
+            # check if there is a conjunction linked directly to a noun
+            if token.dep_ == 'conj' and token.head.pos_ == 'NOUN':
+                has_conjunction = True
+
+        return True if noun_modified and has_conjunction else False
+
+    phrases = []
+    modified_nouns = set()
+    to_split = _split_doc(doc)
+
+    if to_split:
+        for token in doc:
+            if token.dep_ == "amod" and token.head.pos_ == "NOUN":
+                modifier = token.text
+                head_noun = token.head
+
+                if head_noun not in modified_nouns:
+                    nouns_to_modify = [head_noun] + list(head_noun.conjuncts)
+
+                    for noun in nouns_to_modify:
+                        compound_parts = [
+                            child.text for child in noun.lefts if child.dep_ == "compound"
+                        ]
+                        complete_noun_phrase = " ".join(compound_parts + [noun.text])
+                        phrases.append(f"{modifier} {complete_noun_phrase}")
+                        modified_nouns.add(noun)  # Mark this noun as modified
+
+        return phrases if phrases != [] else None
+    else:
+        return None
+
+
+###############################################################
+
+# class SplittingRule(BaseModel):
+#     function: Callable[[Doc], Union[List[str], None]]
+#
+#     @validator("function")
+#     def check_return_type(cls, v):
+#         nlp = en_core_web_sm.load()
+#         dummy_doc = nlp("This is a dummy sentence.")
+#         result = v(dummy_doc)
+#         if result is not None:
+#             if not isinstance(result, List):
+#                 raise ValueError(
+#                     "The custom splitting rule must return None or a list."
+#                 )
+#             elif not all(isinstance(item, str) for item in result):
+#                 raise ValueError(
+#                     "The custom splitting rule must return None or a list of strings."
+#                 )
+#         return v
+
+
+# @Language.factory(
+#     "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"]
+# )
+# def make_coordination_splitter(nlp: Language, name: str):
+#     """Make a CoordinationSplitter component.
+#
+#     the default splitting rules include:
+#     - _split_duplicate_object: Split a text with 2 verbs and 1 object (and optionally a subject) into two texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
+#     - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two texts each with 1 verb and 1 object.
+#     - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1 skill (the phrase must end with 'skills' and the skills must be separated by 'and')
+#
+#     Args:
+#         nlp (Language): The spaCy Language object.
+#         name (str): The name of the component.
+#
+#     RETURNS The CoordinationSplitter component.
+#
+#     DOCS: xxx
+#     """
+#     return CoordinationSplitter(nlp.vocab, name=name)
+
+
+# class CoordinationSplitter(Pipe):
+#     def __init__(
+#         self,
+#         vocab: Vocab,
+#         name: str = "coordination_splitter",
+#         rules: Optional[List[SplittingRule]] = None,
+#     ) -> None:
+#         self.name = name
+#         self.vocab = vocab
+#         if rules is None:
+#             default_rules = [
+#                 _split_duplicate_object,
+#                 _split_duplicate_verb,
+#                 _split_skill_mentions,
+#             ]
+#             self.rules = [SplittingRule(function=rule) for rule in default_rules]
+#         else:
+#             # Ensure provided rules are wrapped in SplittingRule instances
+#             self.rules = [
+#                 rule
+#                 if isinstance(rule, SplittingRule)
+#                 else SplittingRule(function=rule)
+#                 for rule in rules
+#             ]
+
+#     def clear_rules(self) -> None:
+#         """Clear the default splitting rules."""
+#         self.rules = []
+
+#     def add_default_rules(self) -> List[SplittingRule]:
+#         """Reset the default splitting rules."""
+#         default_rules = [
+#             _split_duplicate_object,
+#             _split_duplicate_verb,
+#             _split_skill_mentions,
+#         ]
+#         self.rules = [SplittingRule(function=rule) for rule in default_rules]

+#     def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
+#         """Add a single splitting rule to the default rules."""
+#         validated_rule = SplittingRule(function=rule)
+#         self.rules.append(validated_rule)

+#     def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
+#         """Add a list of splitting rules to the default rules.
+#
+#         Args:
+#             rules (List[Callable[[Doc], Union[List[str], None]]]): A list of functions to be added as splitting rules.
+#         """
+#         for rule in rules:
+#             # Wrap each rule in a SplittingRule instance to ensure it's validated
+#             validated_rule = SplittingRule(function=rule)
+#             self.rules.append(validated_rule)

+#     def __call__(self, doc: Doc) -> Doc:
+#         """Apply the splitting rules to the doc.
+#
+#         Args:
+#             doc (Doc): The spaCy Doc object.
+#
+#         Returns:
+#             Doc: The modified spaCy Doc object.
+#         """
+#         if doc.lang_ != "en":
+#             return doc
+#
+#         for rule in self.rules:
+#             split = rule.function(doc)
+#             if split:
+#                 return Doc(doc.vocab, words=split)
+#         return doc
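Note: the following usage sketch is not part of the commit. It assumes a trained English pipeline such as en_core_web_sm is installed, so that pos_, dep_ and head are set before split_noun_coordination inspects them; the exact output depends on the parser's analysis.

    import spacy
    from spacy.pipeline.coordinationruler import split_noun_coordination

    nlp = spacy.load("en_core_web_sm")  # any pipeline with a tagger and dependency parser

    doc = nlp("I ate hot chicken wings and soup")
    print(split_noun_coordination(doc))
    # expected, per the docstring examples: ['hot chicken wings', 'hot soup']

    doc = nlp("apples and oranges")
    print(split_noun_coordination(doc))
    # no adjectival modifier anywhere, so: None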

View File: spacy/tests/pipeline/test_coordinationruler.py

@@ -1,66 +1,159 @@
 import pytest
 from typing import List
 from spacy.tokens import Doc
+import spacy

-import en_core_web_sm
+from spacy.pipeline.coordinationruler import split_noun_coordination


 @pytest.fixture
 def nlp():
-    return en_core_web_sm.load()
+    return spacy.blank("en")


-def _my_custom_splitting_rule(doc: Doc) -> List[str]:
-    split_phrases = []
-    for token in doc:
-        if token.text == "read":
-            split_phrases.append("test1")
-            split_phrases.append("test2")
-    return split_phrases
-
-
-def test_coordinationruler(nlp):
-    doc = nlp("I read and write books")
-    assert len(doc) == 5
-    assert [d.text for d in doc] == ["I", "read", "and", "write", "books"]
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    assert len(coord_splitter.rules) == 3
-    assert coord_splitter.name == "coordination_splitter"
-    doc_split = coord_splitter(doc)
-    assert len(doc_split) == 2
-    assert [t.text for t in doc_split] == ["I read books", "I write books"]
-
-
-def test_coordinationruler_clear_rules(nlp):
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    assert len(coord_splitter.rules) == 3
-    coord_splitter.clear_rules()
-    assert len(coord_splitter.rules) == 0
-    assert coord_splitter.rules == []
-
-
-def test_coordinationruler_add_rule(nlp):
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    assert len(coord_splitter.rules) == 3
-    coord_splitter.add_rule(_my_custom_splitting_rule)
-    assert len(coord_splitter.rules) == 4
-
-
-def test_coordinationruler_add_rules(nlp):
-    doc = nlp("I read and write books")
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    coord_splitter.clear_rules()
-    coord_splitter.add_rules([_my_custom_splitting_rule, _my_custom_splitting_rule])
-    assert len(coord_splitter.rules) == 2
-    doc_split = coord_splitter(doc)
-    assert len(doc_split) == 2
-    assert [t.text for t in doc_split] == ["test1", "test2"]
-
-
-def test_coordinationruler_add_default_rules(nlp):
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    coord_splitter.clear_rules()
-    assert len(coord_splitter.rules) == 0
-    coord_splitter.add_default_rules()
-    assert len(coord_splitter.rules) == 3
+### NOUN CONSTRUCTION CASES ###
+@pytest.fixture
+def noun_construction_case1(nlp):
+    words = ["apples", "and", "oranges"]
+    spaces = [True, True, False]  # Indicates whether the word is followed by a space
+    pos_tags = ["NOUN", "CCONJ", "NOUN"]
+    dep_relations = ["nsubj", "cc", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # set pos_ and dep_ attributes
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # define head relationships manually
+    doc[1].head = doc[2]  # "and" -> "oranges"
+    doc[2].head = doc[0]  # "oranges" -> "apples"
+    doc[0].head = doc[0]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case2(nlp):
+    words = ["red", "apples", "and", "oranges"]
+    spaces = [True, True, True, False]  # Indicates whether the word is followed by a space
+    pos_tags = ["ADJ", "NOUN", "CCONJ", "NOUN"]
+    dep_relations = ["amod", "nsubj", "cc", "conj"]
+
+    # Create a Doc object manually
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # set pos_ and dep_ attributes
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # define head relationships manually
+    doc[0].head = doc[1]
+    doc[2].head = doc[3]
+    doc[3].head = doc[1]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case3(nlp):
+    words = ["apples", "and", "juicy", "oranges"]
+    spaces = [True, True, True, False]  # Indicates whether the word is followed by a space.
+    pos_tags = ["NOUN", "CCONJ", "ADJ", "NOUN"]
+    dep_relations = ["nsubj", "cc", "amod", "conj"]
+
+    # create a Doc object manually
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # set POS and dependency tags
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # defining head relationships manually
+    doc[0].head = doc[0]  # "apples" as root, pointing to itself for simplicity.
+    doc[1].head = doc[3]  # "and" -> "oranges"
+    doc[2].head = doc[3]  # "juicy" -> "oranges"
+    doc[3].head = doc[0]  # "oranges" -> "apples", indicating a conjunctive relationship
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case4(nlp):
+    words = ["hot", "chicken", "wings", "and", "soup"]
+    spaces = [True, True, True, True, False]  # Indicates whether the word is followed by a space.
+    pos_tags = ["ADJ", "NOUN", "NOUN", "CCONJ", "NOUN"]
+    dep_relations = ["amod", "compound", "ROOT", "cc", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # Define head relationships manually for "hot chicken wings and soup".
+    doc[0].head = doc[2]  # "hot" -> "wings"
+    doc[1].head = doc[2]  # "chicken" -> "wings"
+    doc[2].head = doc[2]  # "wings" as root
+    doc[3].head = doc[4]  # "and" -> "soup"
+    doc[4].head = doc[2]  # "soup" -> "wings"
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case5(nlp):
+    words = ["green", "apples", "and", "rotten", "oranges"]
+    spaces = [True, True, True, True, False]  # Indicates whether the word is followed by a space.
+    pos_tags = ["ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"]
+    dep_relations = ["amod", "ROOT", "cc", "amod", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # Set POS and dependency tags.
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # Define head relationships manually for "green apples and rotten oranges".
+    doc[0].head = doc[1]  # "green" -> "apples"
+    doc[1].head = doc[1]  # "apples" as root
+    doc[2].head = doc[4]  # "and" -> "oranges"
+    doc[3].head = doc[4]  # "rotten" -> "oranges"
+    doc[4].head = doc[1]  # "oranges" -> "apples"
+
+    return doc
+
+
+# test split_noun_coordination on 5 different cases
+def test_split_noun_coordination(noun_construction_case1,
+                                 noun_construction_case2,
+                                 noun_construction_case3,
+                                 noun_construction_case4,
+                                 noun_construction_case5):
+
+    # test 1: no modifier - it should return None from _split_doc
+    case1_split = split_noun_coordination(noun_construction_case1)
+    assert case1_split == None
+
+    # test 2: modifier is at the beginning of the noun phrase
+    case2_split = split_noun_coordination(noun_construction_case2)
+    assert len(case2_split) == 2
+    assert isinstance(case2_split, list)
+    assert all(isinstance(phrase, str) for phrase in case2_split)
+    assert case2_split == ["red apples", "red oranges"]
+
+    # test 3: modifier is at the end of the noun phrase
+    case3_split = split_noun_coordination(noun_construction_case3)
+    assert len(case3_split) == 2
+    assert isinstance(case3_split, list)
+    assert all(isinstance(phrase, str) for phrase in case3_split)
+    assert case3_split == ["juicy apples", "juicy oranges"]
+
+    # test 4: deal with compound nouns
+    case4_split = split_noun_coordination(noun_construction_case4)
+    assert len(case4_split) == 2
+    assert isinstance(case4_split, list)
+    assert all(isinstance(phrase, str) for phrase in case4_split)
+    assert case4_split == ["hot chicken wings", "hot soup"]
+
+    # test 5: multiple modifiers
+    case5_split = split_noun_coordination(noun_construction_case5)
+    assert case5_split == None
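Note: since the factory registration stays commented out, the tests above call split_noun_coordination directly on hand-built Doc objects. As a sketch, the function could still be wired into a pipeline as a stateless component that mirrors the commented-out CoordinationSplitter.__call__; the component name below is hypothetical and nothing in this commit registers it:

    import spacy
    from spacy.language import Language
    from spacy.tokens import Doc
    from spacy.pipeline.coordinationruler import split_noun_coordination

    @Language.component("noun_coordination_splitter")  # hypothetical name
    def noun_coordination_splitter(doc: Doc) -> Doc:
        phrases = split_noun_coordination(doc)
        if phrases:
            # as in the commented-out __call__: rebuild the Doc with one
            # "word" per split phrase
            return Doc(doc.vocab, words=phrases)
        return doc

    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("noun_coordination_splitter", last=True)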