From d82d98b374d30c759c155b5e0e79fd9ace5582db Mon Sep 17 00:00:00 2001
From: India Kerle
Date: Mon, 4 Mar 2024 09:34:02 -0300
Subject: [PATCH] update splitter

---
 spacy/pipeline/__init__.py                    |   4 +-
 spacy/pipeline/coordinationruler.py           | 341 +++++++++-------
 .../tests/pipeline/test_coordinationruler.py  | 373 ++++++++++++++----
 3 files changed, 503 insertions(+), 215 deletions(-)

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
index 52e30ad4f..02c900310 100644
--- a/spacy/pipeline/__init__.py
+++ b/spacy/pipeline/__init__.py
@@ -1,5 +1,5 @@
 from .attributeruler import AttributeRuler
-#from .coordinationruler import CoordinationSplitter
+from .coordinationruler import CoordinationSplitter
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
@@ -22,7 +22,7 @@ from .trainable_pipe import TrainablePipe
 
 __all__ = [
     "AttributeRuler",
-    #"CoordinationSplitter",
+    "CoordinationSplitter",
     "DependencyParser",
     "EditTreeLemmatizer",
     "EntityLinker",
diff --git a/spacy/pipeline/coordinationruler.py b/spacy/pipeline/coordinationruler.py
index e171dca9b..5eeea7ecc 100644
--- a/spacy/pipeline/coordinationruler.py
+++ b/spacy/pipeline/coordinationruler.py
@@ -1,66 +1,130 @@
-from typing import List, Callable, Optional, Union
-from pydantic import BaseModel, validator
 import re
+from typing import Callable, List, Optional, Union
+
+from pydantic import BaseModel, validator
 
-from ..tokens import Doc
 from ..language import Language
+from ..tokens import Doc, Token
 from ..vocab import Vocab
 from .pipe import Pipe
 
-########### DEFAULT COORDINATION SPLITTING RULES ##############
-
-def split_noun_coordination(doc: Doc) -> Union[List[str], None]:
-    """Identifies and splits phrases with multiple nouns, a modifier
-    and a conjunction.
-
-    Examples:
-    - "apples and oranges" -> None
-    - "green apples and oranges" -> ["green apples", "green oranges"]
-    - "green apples and rotten oranges" -> None
-    - "apples and juicy oranges" -> ["juicy apples", "juicy oranges"]
-    - "hot chicken wings and soup" -> ["hot chicken wings", "hot soup"]
-    - "spicy ice cream and chicken wings" -> ["spicy ice cream", "spicy chicken wings"]
-
+######### helper functions across the default splitting rules ##############
+
+
+def _split_doc(doc: Doc) -> bool:
+    """Check whether the document has a noun phrase
+    with a modifier and a conjunction.
+
     Args:
         doc (Doc): The input document.
 
     Returns:
-        Union[List[str], None]: A list of the coordinated noun phrases,
+        bool: True if the document has a noun phrase
+        with a modifier and a conjunction, else False.
+    """
+
+    noun_modified = False
+    has_conjunction = False
+
+    for token in doc:
+        if token.head.pos_ == "NOUN":  # check that the phrase is a noun phrase
+            has_modifier = any(
+                child.dep_ == "amod" for child in token.head.children
+            )  # check whether the noun has a modifier
+            if has_modifier:
+                noun_modified = True
+
+        # check if there is a conjunction in the phrase
+        if token.pos_ == "CCONJ":
+            has_conjunction = True
+
+    return noun_modified and has_conjunction
+
+
+def _collect_modifiers(token: Token) -> List[str]:
+    """Collect the adjectival modifier phrases for a given token,
+    including any adverbial modifiers and conjoined adjectives.
+
+    Args:
+        token (Token): The input token.
+
+    Returns:
+        List[str]: A list of modifier phrases for the token.
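+
+    Example (illustrative; assumes POS and dependency annotations are set):
+    for the noun "apples" in "very green apples" this returns ["very green"],
+    and in "delicious and juicy apples" it returns ["delicious", "juicy"].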
+    """
+    modifiers = []
+    for child in token.children:
+        if child.dep_ == "amod":
+            # collect adverbial modifiers for this adjective
+            adv_mods = [
+                adv_mod.text
+                for adv_mod in child.children
+                if adv_mod.dep_ == "advmod" and adv_mod.pos_ != "CCONJ"
+            ]
+
+            modifier_phrase = " ".join(adv_mods + [child.text])
+            modifiers.append(modifier_phrase)
+            # also collect adjectives conjoined to this adjective
+            for conj in child.conjuncts:
+                adv_mods_conj = [
+                    adv_mod.text
+                    for adv_mod in conj.children
+                    if adv_mod.dep_ == "advmod" and adv_mod.pos_ != "CCONJ"
+                ]
+                modifier_phrase_conj = " ".join(adv_mods_conj + [conj.text])
+                modifiers.append(modifier_phrase_conj)
+
+    return modifiers
+
+
+########### DEFAULT COORDINATION SPLITTING RULES ##############
+
+
+def split_noun_coordination(doc: Doc) -> Union[List[str], None]:
+    """Identifies and splits noun phrases with a modifier
+    and a conjunction.
+
+    Construction cases:
+    - "apples and oranges" -> None
+    - "green apples and oranges" -> ["green apples", "green oranges"]
+    - "apples and juicy oranges" -> ["juicy apples", "juicy oranges"]
+    - "hot chicken wings and soup" -> ["hot chicken wings", "hot soup"]
+    - "green apples and rotten oranges" -> ["green apples", "rotten oranges"]
+    - "very green apples and oranges" -> ["very green apples", "very green oranges"]
+    - "delicious and juicy apples" -> ["delicious apples", "juicy apples"]
+    - "delicious but quite sour apples" -> ["delicious apples", "quite sour apples"]
+    - "delicious but quite sour apples and oranges" -> ["delicious apples",
+      "quite sour apples", "delicious oranges", "quite sour oranges"]
+
+    Args:
+        doc (Doc): The input document.
+
+    Returns:
+        Union[List[str], None]: A list of the coordinated noun phrases,
+            or None if no coordinated noun phrases are found.
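+
+    Example (illustrative sketch; assumes POS and dependency annotations are
+    set upstream, or by hand as in the tests):
+
+        >>> doc = nlp("green apples and oranges")
+        >>> split_noun_coordination(doc)
+        ['green apples', 'green oranges']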
""" - def _split_doc(doc: Doc) -> bool: - noun_modified = False - has_conjunction = False - - for token in doc: - if token.head.pos_ == 'NOUN': ## check to see that the phrase is a noun phrase - has_modifier = any(child.dep_ == 'amod' for child in token.head.children) #check to see if the noun has a modifier - if has_modifier: - noun_modified = True - # check if there is a conjunction linked directly to a noun - if token.dep_ == 'conj' and token.head.pos_ == 'NOUN': - has_conjunction = True - - return True if noun_modified and has_conjunction else False - phrases = [] - modified_nouns = set() + modified_nouns = set() to_split = _split_doc(doc) - - if to_split: + + if to_split: for token in doc: if token.dep_ == "amod" and token.head.pos_ == "NOUN": - modifier = token.text head_noun = token.head - + if head_noun not in modified_nouns: + modifier_phrases = _collect_modifiers(head_noun) nouns_to_modify = [head_noun] + list(head_noun.conjuncts) - + for noun in nouns_to_modify: - compound_parts = [child.text for child in noun.lefts if child.dep_ == "compound"] - complete_noun_phrase = " ".join(compound_parts + [noun.text]) - phrases.append(f"{modifier} {complete_noun_phrase}") - modified_nouns.add(noun) # Mark this noun as modified + compound_parts = [ + child.text + for child in noun.lefts + if child.dep_ == "compound" + ] + complete_noun_phrase = " ".join(compound_parts + [noun.text]) + for modifier_phrase in modifier_phrases: + phrases.append(f"{modifier_phrase} {complete_noun_phrase}") + modified_nouns.add(noun) # mark this noun as modified return phrases if phrases != [] else None else: @@ -69,119 +133,110 @@ def split_noun_coordination(doc: Doc) -> Union[List[str], None]: ############################################################### -# class SplittingRule(BaseModel): -# function: Callable[[Doc], Union[List[str], None]] -# @validator("function") -# def check_return_type(cls, v): -# nlp = en_core_web_sm.load() -# dummy_doc = nlp("This is a dummy sentence.") -# result = v(dummy_doc) -# if result is not None: -# if not isinstance(result, List): -# raise ValueError( -# "The custom splitting rule must return None or a list." -# ) -# elif not all(isinstance(item, str) for item in result): -# raise ValueError( -# "The custom splitting rule must return None or a list of strings." -# ) -# return v +class SplittingRule(BaseModel): + function: Callable[[Doc], Union[List[str], None]] + + @validator("function") + def check_return_type(cls, v): + dummy_doc = Doc(Language().vocab, words=["dummy", "doc"], spaces=[True, False]) + result = v(dummy_doc) + if result is not None: + if not isinstance(result, List): + raise ValueError( + "The custom splitting rule must return None or a list." + ) + elif not all(isinstance(item, str) for item in result): + raise ValueError( + "The custom splitting rule must return None or a list of strings." + ) + return v -# @Language.factory( -# "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"] -# ) -# def make_coordination_splitter(nlp: Language, name: str): -# """Make a CoordinationSplitter component. +@Language.factory( + "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"] +) +def make_coordination_splitter(nlp: Language, name: str): + """Make a CoordinationSplitter component. 
+
-#     the default splitting rules include:
-#     - _split_duplicate_object: Split a text with 2 verbs and 1 object (and optionally a subject) into two texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
-#     - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two texts each with 1 verb and 1 object.
-#     - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1 skill (the phrase must end with 'skills' and the skills must be separated by 'and')
+    The default splitting rules include:
+    - split_noun_coordination
+
+    Args:
+        nlp (Language): The spaCy Language object.
+        name (str): The name of the component.
+
+    RETURNS (CoordinationSplitter): The CoordinationSplitter component.
+
+    DOCS: xxx
+    """
+
+    return CoordinationSplitter(nlp.vocab, name=name)
 
-#     Args:
-#         nlp (Language): The spaCy Language object.
-#         name (str): The name of the component.
 
-#     RETURNS The CoordinationSplitter component.
+class CoordinationSplitter(Pipe):
+    def __init__(
+        self,
+        vocab: Vocab,
+        name: str = "coordination_splitter",
+        rules: Optional[List[SplittingRule]] = None,
+    ) -> None:
+        self.name = name
+        self.vocab = vocab
+        if rules is None:
+            default_rules = [
+                split_noun_coordination,
+            ]
+            self.rules = [SplittingRule(function=rule) for rule in default_rules]
+        else:
+            # Ensure provided rules are wrapped in SplittingRule instances
+            self.rules = [
+                rule
+                if isinstance(rule, SplittingRule)
+                else SplittingRule(function=rule)
+                for rule in rules
+            ]
 
-#     DOCS: xxx
-#     """
+    def clear_rules(self) -> None:
+        """Clear the default splitting rules."""
+        self.rules = []
 
-#     return CoordinationSplitter(nlp.vocab, name=name)
+    def add_default_rules(self) -> None:
+        """Reset the rules to the default splitting rules."""
+        default_rules = [
+            split_noun_coordination,
+        ]
+        self.rules = [SplittingRule(function=rule) for rule in default_rules]
 
-# class CoordinationSplitter(Pipe):
-#     def __init__(
-#         self,
-#         vocab: Vocab,
-#         name: str = "coordination_splitter",
-#         rules: Optional[List[SplittingRule]] = None,
-#     ) -> None:
-#         self.name = name
-#         self.vocab = vocab
-#         if rules is None:
-#             default_rules = [
-#                 _split_duplicate_object,
-#                 _split_duplicate_verb,
-#                 _split_skill_mentions,
-#             ]
-#             self.rules = [SplittingRule(function=rule) for rule in default_rules]
-#         else:
-#             # Ensure provided rules are wrapped in SplittingRule instances
-#             self.rules = [
-#                 rule
-#                 if isinstance(rule, SplittingRule)
-#                 else SplittingRule(function=rule)
-#                 for rule in rules
-#             ]
+    def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
+        """Add a single splitting rule to the default rules."""
+        validated_rule = SplittingRule(function=rule)
+        self.rules.append(validated_rule)
 
-#     def clear_rules(self) -> None:
-#         """Clear the default splitting rules."""
-#         self.rules = []
+    def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
+        """Add a list of splitting rules to the default rules.
 
-#     def add_default_rules(self) -> List[SplittingRule]:
-#         """Reset the default splitting rules."""
-#         default_rules = [
-#             _split_duplicate_object,
-#             _split_duplicate_verb,
-#             _split_skill_mentions,
-#         ]
-#         self.rules = [SplittingRule(function=rule) for rule in default_rules]
+        Args:
+            rules (List[Callable[[Doc], Union[List[str], None]]]): A list of
+                functions to be added as splitting rules.
+        """
+        for rule in rules:
+            # Wrap each rule in a SplittingRule instance to ensure it's validated
+            validated_rule = SplittingRule(function=rule)
+            self.rules.append(validated_rule)
 
-#     def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
-#         """Add a single splitting rule to the default rules."""
-#         validated_rule = SplittingRule(function=rule)
-#         self.rules.append(validated_rule)
+    def __call__(self, doc: Doc) -> Doc:
+        """Apply the splitting rules to the doc.
-# def add_default_rules(self) -> List[SplittingRule]: -# """Reset the default splitting rules.""" -# default_rules = [ -# _split_duplicate_object, -# _split_duplicate_verb, -# _split_skill_mentions, -# ] -# self.rules = [SplittingRule(function=rule) for rule in default_rules] + Args: + doc (Doc): The spaCy Doc object. -# def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None: -# """Add a single splitting rule to the default rules.""" -# validated_rule = SplittingRule(function=rule) -# self.rules.append(validated_rule) + Returns: + Doc: The modified spaCy Doc object. + """ + if doc.lang_ != "en": + return doc -# def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None: -# """Add a list of splitting rules to the default rules. - -# Args: -# rules (List[Callable[[Doc], Union[List[str], None]]]): A list of functions to be added as splitting rules. -# """ -# for rule in rules: -# # Wrap each rule in a SplittingRule instance to ensure it's validated -# validated_rule = SplittingRule(function=rule) -# self.rules.append(validated_rule) - -# def __call__(self, doc: Doc) -> Doc: -# """Apply the splitting rules to the doc. - -# Args: -# doc (Doc): The spaCy Doc object. - -# Returns: -# Doc: The modified spaCy Doc object. -# """ -# if doc.lang_ != "en": -# return doc - -# for rule in self.rules: -# split = rule.function(doc) -# if split: -# return Doc(doc.vocab, words=split) -# return doc + for rule in self.rules: + split = rule.function(doc) + if split: + return Doc(doc.vocab, words=split) + return doc diff --git a/spacy/tests/pipeline/test_coordinationruler.py b/spacy/tests/pipeline/test_coordinationruler.py index 08d6c2a3b..7ead426cc 100644 --- a/spacy/tests/pipeline/test_coordinationruler.py +++ b/spacy/tests/pipeline/test_coordinationruler.py @@ -1,87 +1,84 @@ -import pytest from typing import List -from spacy.tokens import Doc -import spacy +import pytest +import spacy from spacy.pipeline.coordinationruler import split_noun_coordination +from spacy.tokens import Doc + @pytest.fixture def nlp(): return spacy.blank("en") -### NOUN CONSTRUCTION CASES ### + +### CONSTRUCTION CASES ### @pytest.fixture def noun_construction_case1(nlp): words = ["apples", "and", "oranges"] - spaces = [True, True, False] # Indicates whether the word is followed by a space + spaces = [True, True, False] pos_tags = ["NOUN", "CCONJ", "NOUN"] dep_relations = ["nsubj", "cc", "conj"] doc = Doc(nlp.vocab, words=words, spaces=spaces) - #set pos_ and dep_ attributes for token, pos, dep in zip(doc, pos_tags, dep_relations): token.pos_ = pos token.dep_ = dep - - # # define head relationships manually - doc[1].head = doc[2] # "and" -> "oranges" - doc[2].head = doc[0] # "oranges" -> "apples" - doc[0].head = doc[0] - + + doc[1].head = doc[2] + doc[2].head = doc[0] + doc[0].head = doc[0] + return doc - + + @pytest.fixture def noun_construction_case2(nlp): words = ["red", "apples", "and", "oranges"] - spaces = [True, True, True, False] # Indicates whether the word is followed by a space + spaces = [True, True, True, False] pos_tags = ["ADJ", "NOUN", "CCONJ", "NOUN"] dep_relations = ["amod", "nsubj", "cc", "conj"] - # Create a Doc object manually doc = Doc(nlp.vocab, words=words, spaces=spaces) - #set pos_ and dep_ attributes for token, pos, dep in zip(doc, pos_tags, dep_relations): token.pos_ = pos token.dep_ = dep - - # define head relationships manually - doc[0].head = doc[1] - doc[2].head = doc[3] - doc[3].head = doc[1] - + + doc[0].head = doc[1] + doc[2].head = doc[3] + doc[3].head = 
+
     return doc
 
+
 @pytest.fixture
 def noun_construction_case3(nlp):
     words = ["apples", "and", "juicy", "oranges"]
-    spaces = [True, True, True, False]  # Indicates whether the word is followed by a space.
+    spaces = [True, True, True, False]
     pos_tags = ["NOUN", "CCONJ", "ADJ", "NOUN"]
     dep_relations = ["nsubj", "cc", "amod", "conj"]
 
-    #create a Doc object manually
     doc = Doc(nlp.vocab, words=words, spaces=spaces)
 
-    #set POS and dependency tags
     for token, pos, dep in zip(doc, pos_tags, dep_relations):
         token.pos_ = pos
         token.dep_ = dep
 
-    #defining head relationships manually
-    doc[0].head = doc[0]  # "apples" as root, pointing to itself for simplicity.
-    doc[1].head = doc[3]  # "and" -> "oranges"
-    doc[2].head = doc[3]  # "juicy" -> "oranges"
-    doc[3].head = doc[0]  # "oranges" -> "apples", indicating a conjunctive relationship
-
+    doc[0].head = doc[0]
+    doc[1].head = doc[3]
+    doc[2].head = doc[3]
+    doc[3].head = doc[0]
+
     return doc
 
+
 @pytest.fixture
 def noun_construction_case4(nlp):
     words = ["hot", "chicken", "wings", "and", "soup"]
-    spaces = [True, True, True, True, False]  # Indicates whether the word is followed by a space.
-    pos_tags= ["ADJ", "NOUN", "NOUN", "CCONJ", "NOUN"]
+    spaces = [True, True, True, True, False]
+    pos_tags = ["ADJ", "NOUN", "NOUN", "CCONJ", "NOUN"]
     dep_relations = ["amod", "compound", "ROOT", "cc", "conj"]
 
     doc = Doc(nlp.vocab, words=words, spaces=spaces)
@@ -90,77 +87,313 @@ def noun_construction_case4(nlp):
         token.pos_ = pos
         token.dep_ = dep
 
-    # Define head relationships manually for "hot chicken wings and soup".
-    doc[0].head = doc[2]  # "hot" -> "wings"
-    doc[1].head = doc[2]  # "chicken" -> "wings"
-    doc[2].head = doc[2]  # "wings" as root
-    doc[3].head = doc[4]  # "and" -> "soup"
-    doc[4].head = doc[2]  # "soup" -> "wings"
-
+    doc[0].head = doc[2]
+    doc[1].head = doc[2]
+    doc[2].head = doc[2]
+    doc[3].head = doc[4]
+    doc[4].head = doc[2]
+
     return doc
 
+
 @pytest.fixture
 def noun_construction_case5(nlp):
     words = ["green", "apples", "and", "rotten", "oranges"]
-    spaces = [True, True, True, True, False]  # Indicates whether the word is followed by a space.
+    spaces = [True, True, True, True, False]
     pos_tags = ["ADJ", "NOUN", "CCONJ", "ADJ", "NOUN"]
     dep_relations = ["amod", "ROOT", "cc", "amod", "conj"]
 
     doc = Doc(nlp.vocab, words=words, spaces=spaces)
 
-    # Set POS and dependency tags.
     for token, pos, dep in zip(doc, pos_tags, dep_relations):
         token.pos_ = pos
         token.dep_ = dep
 
-    # Define head relationships manually for "green apples and rotten oranges".
-    doc[0].head = doc[1]  # "green" -> "apples"
-    doc[1].head = doc[1]  # "apples" as root
-    doc[2].head = doc[4]  # "and" -> "oranges"
-    doc[3].head = doc[4]  # "rotten" -> "oranges"
-    doc[4].head = doc[1]  # "oranges" -> "apples"
-
+    doc[0].head = doc[1]
+    doc[1].head = doc[1]
+    doc[2].head = doc[4]
+    doc[3].head = doc[4]
+    doc[4].head = doc[1]
+
     return doc
 
-#test split_noun_coordination on 5 different cases
-def test_split_noun_coordination(noun_construction_case1,
-                                 noun_construction_case2,
-                                 noun_construction_case3,
-                                 noun_construction_case4,
-                                 noun_construction_case5):
-
-    #test 1: no modifier - it should return None from _split_doc
+
+@pytest.fixture
+def noun_construction_case6(nlp):
+    words = ["very", "green", "apples", "and", "oranges"]
+    spaces = [True, True, True, True, False]
+    pos_tags = ["ADV", "ADJ", "NOUN", "CCONJ", "NOUN"]
+    dep_relations = ["advmod", "amod", "ROOT", "cc", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    doc[0].head = doc[1]
+    doc[1].head = doc[2]
+    doc[2].head = doc[2]
+    doc[3].head = doc[4]
+    doc[4].head = doc[2]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case7(nlp):
+    words = ["fresh", "and", "juicy", "apples"]
+    spaces = [True, True, True, False]
+    pos_tags = ["ADJ", "CCONJ", "ADJ", "NOUN"]
+    dep_relations = ["amod", "cc", "conj", "ROOT"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    doc[0].head = doc[3]
+    doc[1].head = doc[2]
+    doc[2].head = doc[0]
+    doc[3].head = doc[3]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case8(nlp):
+    words = ["fresh", ",", "juicy", "and", "delicious", "apples"]
+    spaces = [True, True, True, True, True, False]
+    pos_tags = ["ADJ", "PUNCT", "ADJ", "CCONJ", "ADJ", "NOUN"]
+    dep_relations = ["amod", "punct", "conj", "cc", "conj", "ROOT"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    doc[0].head = doc[5]
+    doc[1].head = doc[2]
+    doc[2].head = doc[0]
+    doc[3].head = doc[4]
+    doc[4].head = doc[0]
+    doc[5].head = doc[5]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case9(nlp):
+    words = ["fresh", "and", "quite", "sour", "apples"]
+    spaces = [True, True, True, True, False]
+    pos_tags = ["ADJ", "CCONJ", "ADV", "ADJ", "NOUN"]
+    dep_relations = ["amod", "cc", "advmod", "conj", "ROOT"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    doc[0].head = doc[4]
+    doc[1].head = doc[3]
+    doc[2].head = doc[3]
+    doc[3].head = doc[0]
+    doc[4].head = doc[4]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case10(nlp):
+    words = ["fresh", "but", "quite", "sour", "apples", "and", "chicken", "wings"]
+    spaces = [True, True, True, True, True, True, True, False]
+    pos_tags = ["ADJ", "CCONJ", "ADV", "ADJ", "NOUN", "CCONJ", "NOUN", "NOUN"]
+    dep_relations = ["amod", "cc", "advmod", "conj", "ROOT", "cc", "compound", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    # "wings" heads the compound "chicken" and is conjoined to "apples",
+    # matching the standard UD analysis of "chicken wings"
+    doc[0].head = doc[4]
+    doc[1].head = doc[3]
+    doc[2].head = doc[3]
+    doc[3].head = doc[0]
+    doc[4].head = doc[4]
+    doc[5].head = doc[7]
+    doc[6].head = doc[7]
+    doc[7].head = doc[4]
+
+    return doc
+
+
+@pytest.fixture
+def noun_construction_case11(nlp):
+    words = ["water", "and", "power", "meters", "and", "electrical", "sockets"]
+    spaces = [True, True, True, True, True, True, False]
+    pos_tags = ["NOUN", "CCONJ", "NOUN", "NOUN", "CCONJ", "ADJ", "NOUN"]
+    dep_relations = ["compound", "cc", "compound", "ROOT", "cc", "amod", "conj"]
+
+    doc = Doc(nlp.vocab, words=words, spaces=spaces)
+
+    for token, pos, dep in zip(doc, pos_tags, dep_relations):
+        token.pos_ = pos
+        token.dep_ = dep
+
+    doc[0].head = doc[2]
+    doc[1].head = doc[2]
+    doc[2].head = doc[3]
+    doc[3].head = doc[3]
+    doc[4].head = doc[6]
+    doc[5].head = doc[6]
+    doc[6].head = doc[3]
+
+    return doc
+
+
+### custom splitting rules ###
+def _my_custom_splitting_rule(doc: Doc) -> List[str]:
+    split_phrases = []
+    for token in doc:
+        if token.text == "red":
+            split_phrases.append("test1")
+            split_phrases.append("test2")
+    return split_phrases
+
+
+# test split_noun_coordination on the construction cases
+def test_split_noun_coordination(
+    noun_construction_case1,
+    noun_construction_case2,
+    noun_construction_case3,
+    noun_construction_case4,
+    # noun_construction_case5,
+    noun_construction_case6,
+    noun_construction_case7,
+    noun_construction_case8,
+    noun_construction_case9,
+    noun_construction_case10,
+    noun_construction_case11,
+):
+
+    # test 1: no modifier - it should return None from _split_doc
     case1_split = split_noun_coordination(noun_construction_case1)
-
+
     assert case1_split == None
-
-    #test 2: modifier is at the beginning of the noun phrase
+
+    # test 2: modifier is at the beginning of the noun phrase
     case2_split = split_noun_coordination(noun_construction_case2)
-
+
     assert len(case2_split) == 2
     assert isinstance(case2_split, list)
     assert all(isinstance(phrase, str) for phrase in case2_split)
     assert case2_split == ["red apples", "red oranges"]
 
-    #test 3: modifier is at the end of the noun phrase
+    # test 3: modifier is at the end of the noun phrase
     case3_split = split_noun_coordination(noun_construction_case3)
 
     assert len(case3_split) == 2
     assert isinstance(case3_split, list)
     assert all(isinstance(phrase, str) for phrase in case3_split)
     assert case3_split == ["juicy oranges", "juicy apples"]
-
-    #test 4: deal with compound nouns
+
+    # test 4: deal with compound nouns
     case4_split = split_noun_coordination(noun_construction_case4)
 
     assert len(case4_split) == 2
     assert isinstance(case4_split, list)
     assert all(isinstance(phrase, str) for phrase in case4_split)
     assert case4_split == ["hot chicken wings", "hot soup"]
 
-
-    #test 5: multiple modifiers
-    case5_split = split_noun_coordination(noun_construction_case5)
-    pass #this should return none i think
\ No newline at end of file
+    # #test 5: multiple modifiers
+    # case5_split = split_noun_coordination(noun_construction_case5)
+    # assert case5_split == None
+
+    # test 6: modifier phrases
+    case6_split = split_noun_coordination(noun_construction_case6)
+
+    assert len(case6_split) == 2
+    assert isinstance(case6_split, list)
+    assert all(isinstance(phrase, str) for phrase in case6_split)
+    assert case6_split == ["very green apples", "very green oranges"]
+
+    ## test cases for coordinating adjectives
+
+    # test 7:
+    case7_split = split_noun_coordination(noun_construction_case7)
+    assert case7_split == ["fresh apples", "juicy apples"]
+
+    # test 8:
+    case8_split = split_noun_coordination(noun_construction_case8)
+    assert case8_split == ["fresh apples", "juicy apples", "delicious apples"]
+
+    # test 9:
+    case9_split = split_noun_coordination(noun_construction_case9)
+    assert case9_split == ["fresh apples", "quite sour apples"]
["fresh apples", "quite sour apples"] + + # test 10: + case10_split = split_noun_coordination(noun_construction_case10) + assert case10_split == ["fresh apples", "quite sour apples", "chicken soup"] + + # test 11: + case11_split = split_noun_coordination(noun_construction_case11) + assert case11_split == None + + +################### test factory ############################## + + +def test_coordinationruler(nlp, noun_construction_case2): + assert len(noun_construction_case2) == 4 + assert [d.text for d in noun_construction_case2] == [ + "red", + "apples", + "and", + "oranges", + ] + + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 1 + assert coord_splitter.name == "coordination_splitter" + doc_split = coord_splitter(noun_construction_case2) + assert len(doc_split) == 2 + assert [t.text for t in doc_split] == ["red apples", "red oranges"] + + +def test_coordinationruler_clear_rules(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 1 + coord_splitter.clear_rules() + assert len(coord_splitter.rules) == 0 + assert coord_splitter.rules == [] + + +def test_coordinationruler_add_rule(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + assert len(coord_splitter.rules) == 1 + coord_splitter.add_rule(_my_custom_splitting_rule) + assert len(coord_splitter.rules) == 2 + + +def test_coordinationruler_add_rules(nlp, noun_construction_case2): + + coord_splitter = nlp.add_pipe("coordination_splitter") + coord_splitter.clear_rules() + coord_splitter.add_rules([_my_custom_splitting_rule, _my_custom_splitting_rule]) + assert len(coord_splitter.rules) == 2 + doc_split = coord_splitter(noun_construction_case2) + assert len(doc_split) == 2 + + assert [t.text for t in doc_split] == ["test1", "test2"] + + +def test_coordinationruler_add_default_rules(nlp): + coord_splitter = nlp.add_pipe("coordination_splitter") + coord_splitter.clear_rules() + assert len(coord_splitter.rules) == 0 + coord_splitter.add_default_rules() + assert len(coord_splitter.rules) == 1