add usecase

parent d66a616f31
commit 81c52c8ff2
@@ -1,5 +1,5 @@
 from .attributeruler import AttributeRuler
-from .coordinationruler import CoordinationSplitter
+#from .coordinationruler import CoordinationSplitter
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
@@ -22,7 +22,7 @@ from .trainable_pipe import TrainablePipe
 
 __all__ = [
     "AttributeRuler",
-    "CoordinationSplitter",
+    #"CoordinationSplitter",
     "DependencyParser",
     "EditTreeLemmatizer",
     "EntityLinker",
@@ -1,7 +1,6 @@
 from typing import List, Callable, Optional, Union
 from pydantic import BaseModel, validator
 import re
-import en_core_web_sm
 
 from ..tokens import Doc
 from ..language import Language
@@ -9,313 +8,180 @@ from ..vocab import Vocab
 from .pipe import Pipe
 
 ########### DEFAULT COORDINATION SPLITTING RULES ##############
 
-def _split_duplicate_object(doc: Doc) -> Union[List[str], None]:
-    """Split a text with 2 verbs and 1 object (and optionally a subject) into
-    2 texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
-
-    i.e. 'I use and provide clinical supervision' -->
-    ['I use clinical supervision', 'I provide clinical supervision']
-
-    Args:
-        doc (Doc): The spaCy Doc object.
-
-    Returns:
-        List[str]: The split texts.
-    """
-    sentences = []
-
-    for token in doc:
-        if token.pos_ == "VERB" and (token.dep_ == "ROOT" or token.dep_ == "conj"):
-
-            has_AND = False
-            has_second_verb = False
-            has_dobj = False
-            subject = None
-
-            # Find the subject if it exists
-            for possible_subject in token.head.children:
-                if possible_subject.dep_ in ["nsubj", "nsubjpass"]:
-                    subject = possible_subject
-                    break
-
-            for child in token.children:
-
-                if child.pos_ == "CCONJ" and child.lemma_ == "and":
-                    has_AND = True
-
-                if child.pos_ == "VERB" and child.dep_ == "conj":
-                    has_second_verb = True
-                    second_verb = child
-                    first_verb = token.head if token.dep_ == "conj" else token
-
-                    for descendant in second_verb.subtree:
-                        if descendant.dep_ == "dobj":
-                            has_dobj = True
-                            # Collect the full noun phrase for the direct object
-                            dobj_span = doc[
-                                descendant.left_edge.i : descendant.right_edge.i + 1
-                            ]
-                            dobj = dobj_span.text
-
-            if has_AND and has_second_verb and has_dobj:
-                subject_text = subject.text + " " if subject else ""
-                first_text = "{}{} {}".format(subject_text, first_verb, dobj)
-                second_text = "{}{} {}".format(subject_text, second_verb, dobj)
-
-                sentences.extend([first_text, second_text])
-
-    return sentences if sentences else None
-
-
-def _split_on_and(text: str) -> List[str]:
-    """Split a text on 'and' and return a list of the split texts.
-
-    Args:
-        text (str): The text to split.
-
-    Returns:
-        List[str]: The split texts.
-    """
-    text = re.sub(r"\s\s+", " ", text)
-
-    replacements = {
-        ";": ",",
-        ", and ,": " and ",
-        ", and,": " and ",
-        ",and ,": " and ",
-        ", and ": " and ",
-        " and ,": " and ",
-        ",and,": " and ",
-        " and,": " and ",
-        ",and ": " and ",
-    }
-    for old, new in replacements.items():
-        text = text.replace(old, new)
-
-    return [t.strip() for t in re.split(r",| and ", text)]
-
-
-def _split_duplicate_verb(doc: Doc) -> Union[List[str], None]:
-    """Split a text with 1 verb and 2 objects.
-
-    i.e. 'I love using smartphones and apps' -->
-    ['I love using smartphones', 'I love using apps']
-
-    Args:
-        doc (Doc): The spaCy Doc object.
-
-    Returns:
-        List[str]: The split texts.
-    """
-
-    for token in doc:
-
-        if token.pos_ == "VERB" and token.dep_ == "ROOT":
-
-            has_AND = False
-            has_dobj = False
-            has_sec_obj = False
-            subject = ""
-
-            for child in token.children:
-
-                if child.dep_ == "dobj":
-                    has_dobj = True
-
-                    subject = child.text if child.dep_ == "nsubj" else subject
-
-                    objects = " ".join(
-                        [
-                            c.text
-                            for c in token.subtree
-                            if c.text != token.text and c.dep_ != "nsubj"
-                        ]
-                    )
-
-                    split_objects = _split_on_and(objects)
-
-                    object_list = []
-                    for split in split_objects:
-                        object_list.append(split)
-
-                    for subchild in child.children:
-
-                        if subchild.pos_ == "CCONJ" and subchild.lemma_ == "and":
-                            has_AND = True
-
-                        if subchild.dep_ == "conj":
-                            has_sec_obj = True
-
-                    if has_AND and has_dobj and has_sec_obj:
-                        text_list = [
-                            f"{subject} {token.text} {split}.".strip()
-                            for split in object_list
-                        ]
-                        return [text.replace(" ..", ".") for text in text_list]
-
-    return None
-
-
-def _split_skill_mentions(doc: Doc) -> Union[List[str], None]:
-    """Split a text with 2 skills into 2 texts with 1 skill.
-
-    i.e. 'written and oral communication skills' -->
-    ['written communication skills', 'oral communication skills']
-
-    Args:
-        text (str): The text to split.
-
-    Returns:
-        List[str]: The split texts.
-    """
-    for token in doc:
-        if (
-            token.pos_ == "NOUN"
-            and token.lemma_ == "skill"
-            and token.idx == doc[-1].idx
-        ):
-
-            has_AND = False
-
-            root = [token for token in doc if token.dep_ == "ROOT"]
-            if root:
-                root = root[0]
-
-                for child in root.subtree:
-
-                    if child.pos_ == "CCONJ" and child.lemma_ == "and":
-                        has_AND = True
-
-                if has_AND:
-                    skill_def = " ".join(
-                        [c.text for c in root.subtree if c.text != token.text]
-                    )
-
-                    split_skills = _split_on_and(skill_def)
-
-                    skill_lists = []
-                    for split_skill in split_skills:
-                        skill_lists.append("{} {}".format(split_skill, token.text))
-
-                    return skill_lists
-    return None
-
-
-class SplittingRule(BaseModel):
-    function: Callable[[Doc], Union[List[str], None]]
-
-    @validator("function")
-    def check_return_type(cls, v):
-        nlp = en_core_web_sm.load()
-        dummy_doc = nlp("This is a dummy sentence.")
-        result = v(dummy_doc)
-        if result is not None:
-            if not isinstance(result, List):
-                raise ValueError(
-                    "The custom splitting rule must return None or a list."
-                )
-            elif not all(isinstance(item, str) for item in result):
-                raise ValueError(
-                    "The custom splitting rule must return None or a list of strings."
-                )
-        return v
-
-
-@Language.factory(
-    "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"]
-)
-def make_coordination_splitter(nlp: Language, name: str):
-    """Make a CoordinationSplitter component.
-
-    the default splitting rules include:
-
-    - _split_duplicate_object: Split a text with 2 verbs and 1 object (and optionally a subject) into two texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
-    - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two texts each with 1 verb and 1 object.
-    - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1 skill (the phrase must end with 'skills' and the skills must be separated by 'and')
-
-    Args:
-        nlp (Language): The spaCy Language object.
-        name (str): The name of the component.
-
-    RETURNS The CoordinationSplitter component.
-
-    DOCS: xxx
-    """
-
-    return CoordinationSplitter(nlp.vocab, name=name)
-
-
-class CoordinationSplitter(Pipe):
-    def __init__(
-        self,
-        vocab: Vocab,
-        name: str = "coordination_splitter",
-        rules: Optional[List[SplittingRule]] = None,
-    ) -> None:
-        self.name = name
-        self.vocab = vocab
-        if rules is None:
-            default_rules = [
-                _split_duplicate_object,
-                _split_duplicate_verb,
-                _split_skill_mentions,
-            ]
-            self.rules = [SplittingRule(function=rule) for rule in default_rules]
-        else:
-            # Ensure provided rules are wrapped in SplittingRule instances
-            self.rules = [
-                rule
-                if isinstance(rule, SplittingRule)
-                else SplittingRule(function=rule)
-                for rule in rules
-            ]
-
-    def clear_rules(self) -> None:
-        """Clear the default splitting rules."""
-        self.rules = []
-
-    def add_default_rules(self) -> List[SplittingRule]:
-        """Reset the default splitting rules."""
-        default_rules = [
-            _split_duplicate_object,
-            _split_duplicate_verb,
-            _split_skill_mentions,
-        ]
-        self.rules = [SplittingRule(function=rule) for rule in default_rules]
-
-    def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
-        """Add a single splitting rule to the default rules."""
-        validated_rule = SplittingRule(function=rule)
-        self.rules.append(validated_rule)
-
-    def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
-        """Add a list of splitting rules to the default rules.
-
-        Args:
-            rules (List[Callable[[Doc], Union[List[str], None]]]): A list of functions to be added as splitting rules.
-        """
-        for rule in rules:
-            # Wrap each rule in a SplittingRule instance to ensure it's validated
-            validated_rule = SplittingRule(function=rule)
-            self.rules.append(validated_rule)
-
-    def __call__(self, doc: Doc) -> Doc:
-        """Apply the splitting rules to the doc.
-
-        Args:
-            doc (Doc): The spaCy Doc object.
-
-        Returns:
-            Doc: The modified spaCy Doc object.
-        """
-        if doc.lang_ != "en":
-            return doc
-
-        for rule in self.rules:
-            split = rule.function(doc)
-            if split:
-                return Doc(doc.vocab, words=split)
-        return doc
+def split_noun_coordination(doc: Doc) -> Union[List[str], None]:
+    """Identifies and splits phrases with multiple nouns, a modifier
+    and a conjunction.
+
+    Examples:
+    - "apples and oranges" -> None
+    - "green apples and oranges" -> ["green apples", "green oranges"]
+    - "green apples and rotten oranges" -> None
+    - "apples and juicy oranges" -> ["juicy apples", "juicy oranges"]
+    - "hot chicken wings and soup" -> ["hot chicken wings", "hot soup"]
+    - "spicy ice cream and chicken wings" -> ["spicy ice cream", "spicy chicken wings"]
+
+    Args:
+        doc (Doc): The input document.
+
+    Returns:
+        Union[List[str], None]: A list of the coordinated noun phrases,
+            or None if no coordinated noun phrases are found.
+    """
+
+    def _split_doc(doc: Doc) -> bool:
+        noun_modified = False
+        has_conjunction = False
+
+        for token in doc:
+            if token.head.pos_ == 'NOUN':  ## check to see that the phrase is a noun phrase
+                has_modifier = any(child.dep_ == 'amod' for child in token.head.children)  # check to see if the noun has a modifier
+                if has_modifier:
+                    noun_modified = True
+            # check if there is a conjunction linked directly to a noun
+            if token.dep_ == 'conj' and token.head.pos_ == 'NOUN':
+                has_conjunction = True
+
+        return True if noun_modified and has_conjunction else False
+
+    phrases = []
+    modified_nouns = set()
+    to_split = _split_doc(doc)
+
+    if to_split:
+        for token in doc:
+            if token.dep_ == "amod" and token.head.pos_ == "NOUN":
+                modifier = token.text
+                head_noun = token.head
+
+                if head_noun not in modified_nouns:
+                    nouns_to_modify = [head_noun] + list(head_noun.conjuncts)
+
+                    for noun in nouns_to_modify:
+                        compound_parts = [child.text for child in noun.lefts if child.dep_ == "compound"]
+                        complete_noun_phrase = " ".join(compound_parts + [noun.text])
+                        phrases.append(f"{modifier} {complete_noun_phrase}")
+                        modified_nouns.add(noun)  # Mark this noun as modified
+
+        return phrases if phrases != [] else None
+    else:
+        return None
+
+
+###############################################################
+
+# class SplittingRule(BaseModel):
+#     function: Callable[[Doc], Union[List[str], None]]
+
+#     @validator("function")
+#     def check_return_type(cls, v):
+#         nlp = en_core_web_sm.load()
+#         dummy_doc = nlp("This is a dummy sentence.")
+#         result = v(dummy_doc)
+#         if result is not None:
+#             if not isinstance(result, List):
+#                 raise ValueError(
+#                     "The custom splitting rule must return None or a list."
+#                 )
+#             elif not all(isinstance(item, str) for item in result):
+#                 raise ValueError(
+#                     "The custom splitting rule must return None or a list of strings."
+#                 )
+#         return v
+
+
+# @Language.factory(
+#     "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"]
+# )
+# def make_coordination_splitter(nlp: Language, name: str):
+#     """Make a CoordinationSplitter component.
+
+#     the default splitting rules include:
+
+#     - _split_duplicate_object: Split a text with 2 verbs and 1 object (and optionally a subject) into two texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
+#     - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two texts each with 1 verb and 1 object.
+#     - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1 skill (the phrase must end with 'skills' and the skills must be separated by 'and')
+
+#     Args:
+#         nlp (Language): The spaCy Language object.
+#         name (str): The name of the component.
+
+#     RETURNS The CoordinationSplitter component.
+
+#     DOCS: xxx
+#     """
+
+#     return CoordinationSplitter(nlp.vocab, name=name)
+
+
+# class CoordinationSplitter(Pipe):
+#     def __init__(
+#         self,
+#         vocab: Vocab,
+#         name: str = "coordination_splitter",
+#         rules: Optional[List[SplittingRule]] = None,
+#     ) -> None:
+#         self.name = name
+#         self.vocab = vocab
+#         if rules is None:
+#             default_rules = [
+#                 _split_duplicate_object,
+#                 _split_duplicate_verb,
+#                 _split_skill_mentions,
+#             ]
+#             self.rules = [SplittingRule(function=rule) for rule in default_rules]
+#         else:
+#             # Ensure provided rules are wrapped in SplittingRule instances
+#             self.rules = [
+#                 rule
+#                 if isinstance(rule, SplittingRule)
+#                 else SplittingRule(function=rule)
+#                 for rule in rules
+#             ]
+
+#     def clear_rules(self) -> None:
+#         """Clear the default splitting rules."""
+#         self.rules = []
+
+#     def add_default_rules(self) -> List[SplittingRule]:
+#         """Reset the default splitting rules."""
+#         default_rules = [
+#             _split_duplicate_object,
+#             _split_duplicate_verb,
+#             _split_skill_mentions,
+#         ]
+#         self.rules = [SplittingRule(function=rule) for rule in default_rules]
+
+#     def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
+#         """Add a single splitting rule to the default rules."""
+#         validated_rule = SplittingRule(function=rule)
+#         self.rules.append(validated_rule)
+
+#     def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
+#         """Add a list of splitting rules to the default rules.
+
+#         Args:
+#             rules (List[Callable[[Doc], Union[List[str], None]]]): A list of functions to be added as splitting rules.
+#         """
+#         for rule in rules:
+#             # Wrap each rule in a SplittingRule instance to ensure it's validated
+#             validated_rule = SplittingRule(function=rule)
+#             self.rules.append(validated_rule)
+
+#     def __call__(self, doc: Doc) -> Doc:
+#         """Apply the splitting rules to the doc.
+
+#         Args:
+#             doc (Doc): The spaCy Doc object.
+
+#         Returns:
+#             Doc: The modified spaCy Doc object.
+#         """
+#         if doc.lang_ != "en":
+#             return doc
+
+#         for rule in self.rules:
+#             split = rule.function(doc)
+#             if split:
+#                 return Doc(doc.vocab, words=split)
+#         return doc
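A minimal usage sketch (not part of the diff): since the new split_noun_coordination only reads pos_, dep_ and head attributes, it can be exercised on a hand-built Doc with no trained pipeline, exactly as the test fixtures below do; the expected output follows the function's own docstring.

    import spacy
    from spacy.tokens import Doc
    from spacy.pipeline.coordinationruler import split_noun_coordination

    nlp = spacy.blank("en")  # no parser needed; annotations are set by hand
    doc = Doc(nlp.vocab, words=["red", "apples", "and", "oranges"],
              spaces=[True, True, True, False])
    for token, pos, dep in zip(doc, ["ADJ", "NOUN", "CCONJ", "NOUN"],
                               ["amod", "nsubj", "cc", "conj"]):
        token.pos_ = pos
        token.dep_ = dep
    doc[0].head = doc[1]  # "red" -> "apples"
    doc[2].head = doc[3]  # "and" -> "oranges"
    doc[3].head = doc[1]  # "oranges" -> "apples" (conjunct)

    print(split_noun_coordination(doc))  # ['red apples', 'red oranges']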
@@ -1,66 +1,159 @@
 import pytest
 from typing import List
 
 from spacy.tokens import Doc
+import spacy
 
-import en_core_web_sm
+from spacy.pipeline.coordinationruler import split_noun_coordination
 
 
 @pytest.fixture
 def nlp():
-    return en_core_web_sm.load()
-
-
-def _my_custom_splitting_rule(doc: Doc) -> List[str]:
-    split_phrases = []
-    for token in doc:
-        if token.text == "read":
-            split_phrases.append("test1")
-            split_phrases.append("test2")
-    return split_phrases
-
-
-def test_coordinationruler(nlp):
-    doc = nlp("I read and write books")
-    assert len(doc) == 5
-    assert [d.text for d in doc] == ["I", "read", "and", "write", "books"]
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    assert len(coord_splitter.rules) == 3
-    assert coord_splitter.name == "coordination_splitter"
-    doc_split = coord_splitter(doc)
-    assert len(doc_split) == 2
-    assert [t.text for t in doc_split] == ["I read books", "I write books"]
-
-
-def test_coordinationruler_clear_rules(nlp):
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    assert len(coord_splitter.rules) == 3
-    coord_splitter.clear_rules()
-    assert len(coord_splitter.rules) == 0
-    assert coord_splitter.rules == []
-
-
-def test_coordinationruler_add_rule(nlp):
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    assert len(coord_splitter.rules) == 3
-    coord_splitter.add_rule(_my_custom_splitting_rule)
-    assert len(coord_splitter.rules) == 4
-
-
-def test_coordinationruler_add_rules(nlp):
-    doc = nlp("I read and write books")
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    coord_splitter.clear_rules()
-    coord_splitter.add_rules([_my_custom_splitting_rule, _my_custom_splitting_rule])
-    assert len(coord_splitter.rules) == 2
-    doc_split = coord_splitter(doc)
-    assert len(doc_split) == 2
-    assert [t.text for t in doc_split] == ["test1", "test2"]
-
-
-def test_coordinationruler_add_default_rules(nlp):
-    coord_splitter = nlp.add_pipe("coordination_splitter")
-    coord_splitter.clear_rules()
-    assert len(coord_splitter.rules) == 0
-    coord_splitter.add_default_rules()
-    assert len(coord_splitter.rules) == 3
+    return spacy.blank("en")
+
+
+### NOUN CONSTRUCTION CASES ###
+@pytest.fixture
+def noun_construction_case1(nlp):
+    words = ["apples", "and", "oranges"]
+    spaces = [True, True, False]  # Indicates whether the word is followed by a space
+    pos_tags = ["NOUN", "CCONJ", "NOUN"]
+    dep_relations = ["nsubj", "cc", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # set pos_ and dep_ attributes
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # define head relationships manually
+    doc[1].head = doc[2]  # "and" -> "oranges"
+    doc[2].head = doc[0]  # "oranges" -> "apples"
+    doc[0].head = doc[0]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case2(nlp):
+    words = ["red", "apples", "and", "oranges"]
+    spaces = [True, True, True, False]  # Indicates whether the word is followed by a space
+    pos_tags = ["ADJ", "NOUN", "CCONJ", "NOUN"]
+    dep_relations = ["amod", "nsubj", "cc", "conj"]
+
+    # Create a Doc object manually
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # set pos_ and dep_ attributes
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # define head relationships manually
+    doc[0].head = doc[1]
+    doc[2].head = doc[3]
+    doc[3].head = doc[1]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case3(nlp):
+    words = ["apples", "and", "juicy", "oranges"]
+    spaces = [True, True, True, False]  # Indicates whether the word is followed by a space.
+    pos_tags = ["NOUN", "CCONJ", "ADJ", "NOUN"]
+    dep_relations = ["nsubj", "cc", "amod", "conj"]
+
+    # create a Doc object manually
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # set POS and dependency tags
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # defining head relationships manually
+    doc[0].head = doc[0]  # "apples" as root, pointing to itself for simplicity.
+    doc[1].head = doc[3]  # "and" -> "oranges"
+    doc[2].head = doc[3]  # "juicy" -> "oranges"
+    doc[3].head = doc[0]  # "oranges" -> "apples", indicating a conjunctive relationship
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case4(nlp):
+    words = ["hot", "chicken", "wings", "and", "soup"]
+    spaces = [True, True, True, True, False]  # Indicates whether the word is followed by a space.
+    pos_tags = ["ADJ", "NOUN", "NOUN", "CCONJ", "NOUN"]
+    dep_relations = ["amod", "compound", "ROOT", "cc", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # Define head relationships manually for "hot chicken wings and soup".
+    doc[0].head = doc[2]  # "hot" -> "wings"
+    doc[1].head = doc[2]  # "chicken" -> "wings"
+    doc[2].head = doc[2]  # "wings" as root
+    doc[3].head = doc[4]  # "and" -> "soup"
+    doc[4].head = doc[2]  # "soup" -> "wings"
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case5(nlp):
+    words = ["green", "apples", "and", "rotten", "oranges"]
+    spaces = [True, True, True, True, False]  # Indicates whether the word is followed by a space.
+    pos_tags = ["ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"]
+    dep_relations = ["amod", "ROOT", "cc", "amod", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    # Set POS and dependency tags.
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # Define head relationships manually for "green apples and rotten oranges".
+    doc[0].head = doc[1]  # "green" -> "apples"
+    doc[1].head = doc[1]  # "apples" as root
+    doc[2].head = doc[4]  # "and" -> "oranges"
+    doc[3].head = doc[4]  # "rotten" -> "oranges"
+    doc[4].head = doc[1]  # "oranges" -> "apples"
+
+    return doc
+
+
+# test split_noun_coordination on 5 different cases
+def test_split_noun_coordination(noun_construction_case1,
+                                 noun_construction_case2,
+                                 noun_construction_case3,
+                                 noun_construction_case4,
+                                 noun_construction_case5):
+
+    # test 1: no modifier - it should return None from _split_doc
+    case1_split = split_noun_coordination(noun_construction_case1)
+    assert case1_split == None
+
+    # test 2: modifier is at the beginning of the noun phrase
+    case2_split = split_noun_coordination(noun_construction_case2)
+    assert len(case2_split) == 2
+    assert isinstance(case2_split, list)
+    assert all(isinstance(phrase, str) for phrase in case2_split)
+    assert case2_split == ["red apples", "red oranges"]
+
+    # test 3: modifier is at the end of the noun phrase
+    case3_split = split_noun_coordination(noun_construction_case3)
+    assert len(case3_split) == 2
+    assert isinstance(case3_split, list)
+    assert all(isinstance(phrase, str) for phrase in case3_split)
+    assert case3_split == ["juicy apples", "juicy oranges"]
+
+    # test 4: deal with compound nouns
+    case4_split = split_noun_coordination(noun_construction_case4)
+    assert len(case4_split) == 2
+    assert isinstance(case4_split, list)
+    assert all(isinstance(phrase, str) for phrase in case4_split)
+    assert case4_split == ["hot chicken wings", "hot soup"]
+
+    # test 5: multiple modifiers
+    case5_split = split_noun_coordination(noun_construction_case5)
+    assert case5_split == None
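Because the commit comments out the old coordination_splitter factory, split_noun_coordination is left as a plain helper rather than a registered component. A hypothetical way to wire it into a pipeline (the component name here is illustrative, not part of the commit) is spaCy's @Language.component decorator; note the helper needs pos_/dep_/head annotations, so a trained parser would have to run earlier in the pipeline.

    import spacy
    from spacy.language import Language
    from spacy.tokens import Doc
    from spacy.pipeline.coordinationruler import split_noun_coordination

    @Language.component("noun_coordination_splitter")  # illustrative name
    def noun_coordination_splitter(doc: Doc) -> Doc:
        phrases = split_noun_coordination(doc)
        if phrases:
            # Rebuild the doc from the split phrases, one "word" per phrase,
            # mirroring what the old CoordinationSplitter.__call__ returned.
            return Doc(doc.vocab, words=phrases)
        return doc

    # nlp = spacy.load("en_core_web_sm")  # assumes a trained parser is installed
    # nlp.add_pipe("noun_coordination_splitter", last=True)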