Mirror of https://github.com/explosion/spaCy.git (synced 2025-10-31 07:57:35 +03:00)
	Adding noun_chunks to the DUTCH language model (nl) (#8529)
* ✨ implement noun_chunks for the Dutch language
* copy/paste FR and SV syntax iterators to accommodate UD tags
* added tests with Dutch text
* signed contributor agreement
* 🐛 fix noun chunks generator
* built from scratch
* define noun chunk as a single Noun-Phrase
* includes some corner-case debugging (incorrect POS tagging)
* test with provided annotated sample (POS, DEP)
* ✅ fix failing test
* CI pipeline did not like the added sample file
* add the sample as a pytest fixture
* Update spacy/lang/nl/syntax_iterators.py
* Update spacy/lang/nl/syntax_iterators.py (code readability)
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* Update spacy/tests/lang/nl/test_noun_chunks.py (correct comment)
  Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
* finalize code
* change "if next_word" into "if next_word is not None"

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
This commit is contained in:
parent 2a8eeed5da
commit e117573822
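For orientation, here is a minimal usage sketch of what this commit enables. It assumes a trained Dutch pipeline such as nl_core_news_sm is installed (the model name is an assumption and the exact chunks depend on the model's predictions; the Doc needs POS and DEP annotations for noun_chunks to work):

import spacy

# Assumes a trained Dutch pipeline, e.g.: python -m spacy download nl_core_news_sm
nlp = spacy.load("nl_core_news_sm")
doc = nlp("We kregen alweer ruzie toen we de supermarkt ingingen.")

# After this change, Dutch Docs expose base noun phrases via doc.noun_chunks.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.label_, chunk.root.text)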
spacy/errors.py
@@ -864,6 +864,9 @@ class Errors:
     E1018 = ("Knowledge base for component '{name}' is not set. "
              "Make sure either `nel.initialize` or `nel.set_kb` "
              "is called with a `kb_loader` function.")
+    E1019 = ("`noun_chunks` requires the pos tagging, which requires a "
+             "statistical model to be installed and loaded. For more info, see "
+             "the documentation:\nhttps://spacy.io/usage/models")
 
 
 # Deprecated model shortcuts, only used in errors and warnings
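As a quick illustration of when the new E1019 error fires, a hedged sketch (assuming a spaCy build that includes this change): a blank Dutch pipeline tokenizes but sets no POS tags, so iterating doc.noun_chunks raises ValueError with this message, mirroring test_need_dep further down.

import spacy

nlp = spacy.blank("nl")                # tokenizer only: no tagger, no parser
doc = nlp("Haar vriend lacht luid.")

try:
    list(doc.noun_chunks)
except ValueError as err:
    print(err)                         # E1019: `noun_chunks` requires the pos tagging ...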
spacy/lang/nl/__init__.py
@@ -1,12 +1,14 @@
 from typing import Optional
 
 from thinc.api import Model
 
-from .stop_words import STOP_WORDS
+from .lemmatizer import DutchLemmatizer
 from .lex_attrs import LEX_ATTRS
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_INFIXES
 from .punctuation import TOKENIZER_SUFFIXES
-from .lemmatizer import DutchLemmatizer
+from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
 from ...language import Language
 
@@ -16,6 +18,7 @@ class DutchDefaults(Language.Defaults):
     infixes = TOKENIZER_INFIXES
     suffixes = TOKENIZER_SUFFIXES
     lex_attr_getters = LEX_ATTRS
+    syntax_iterators = SYNTAX_ITERATORS
     stop_words = STOP_WORDS
 
 
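A small sketch of what this registration buys you (hedged: it only relies on syntax_iterators being part of Language.Defaults; no trained model is needed just to wire it up):

from spacy.lang.nl import Dutch

# After this change, the iterator ships with the Dutch language defaults.
assert "noun_chunks" in Dutch.Defaults.syntax_iterators

nlp = Dutch()                          # blank Dutch pipeline
doc = nlp("Haar vriend lacht luid.")
# doc.noun_chunks is now wired up, but the Doc still needs POS and DEP
# annotations (from a trained pipeline, or set manually) before it yields
# anything; see the E1019 / E029 checks in syntax_iterators.py below.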
spacy/lang/nl/syntax_iterators.py (new file, +72 lines)
from typing import Union, Iterator

from ...symbols import NOUN, PRON
from ...errors import Errors
from ...tokens import Doc, Span


def noun_chunks(doclike: Union[Doc, Span]) -> Iterator[Span]:
    """
    Detect base noun phrases from a dependency parse. Works on Doc and Span.
    The definition is inspired by https://www.nltk.org/book/ch07.html
    Consider : [Noun + determinant / adjective] and also [Pronoun]
    """
    # fmt: off
    # labels = ["nsubj", "nsubj:pass", "obj", "iobj", "ROOT", "appos", "nmod", "nmod:poss"]
    # fmt: on
    doc = doclike.doc  # Ensure works on both Doc and Span.

    # Check for dependencies: POS, DEP
    if not doc.has_annotation("POS"):
        raise ValueError(Errors.E1019)
    if not doc.has_annotation("DEP"):
        raise ValueError(Errors.E029)

    # See UD tags: https://universaldependencies.org/u/dep/index.html
    # amod = adjectival modifier
    # nmod:poss = possessive nominal modifier
    # nummod = numeric modifier
    # det = determiner
    # det:poss = possessive determiner
    noun_deps = [
        doc.vocab.strings[label] for label in ["amod", "nmod:poss", "det", "det:poss"]
    ]

    # nsubj = nominal subject
    # nsubj:pass = passive nominal subject
    pronoun_deps = [doc.vocab.strings[label] for label in ["nsubj", "nsubj:pass"]]

    # Label NP for the Span to identify it as Noun-Phrase
    span_label = doc.vocab.strings.add("NP")

    # Only NOUNS and PRONOUNS matter
    for i, word in enumerate(filter(lambda x: x.pos in [PRON, NOUN], doclike)):
        # For NOUNS
        # Pick children from syntactic parse (only those with certain dependencies)
        if word.pos == NOUN:
            # Some debugging. It happens that VERBS are POS-TAGGED as NOUNS
            # We check if the word has a "nsubj", if it's the case, we eliminate it
            nsubjs = filter(
                lambda x: x.dep == doc.vocab.strings["nsubj"], word.children
            )
            next_word = next(nsubjs, None)
            if next_word is not None:
                # We found some nsubj, so we skip this word. Otherwise, consider it a normal NOUN
                continue

            children = filter(lambda x: x.dep in noun_deps, word.children)
            children_i = [c.i for c in children] + [word.i]

            start_span = min(children_i)
            end_span = max(children_i) + 1
            yield start_span, end_span, span_label

        # PRONOUNS only if it is the subject of a verb
        elif word.pos == PRON:
            if word.dep in pronoun_deps:
                start_span = word.i
                end_span = word.i + 1
                yield start_span, end_span, span_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
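A minimal sketch of exercising the iterator on a hand-annotated Doc, in the spirit of the test fixture further down (no trained pipeline needed; the annotations are supplied manually, so treat the expected output as illustrative):

from spacy.lang.nl import Dutch
from spacy.tokens import Doc

nlp = Dutch()
words = ["Haar", "vriend", "lacht", "luid", "."]
heads = [1, 2, 2, 2, 2]
deps = ["nmod:poss", "nsubj", "ROOT", "advmod", "punct"]
pos = ["PRON", "NOUN", "VERB", "ADJ", "PUNCT"]
doc = Doc(nlp.vocab, words=words, heads=heads, deps=deps, pos=pos)

# "Haar vriend" should come out as a single NP chunk: the NOUN "vriend"
# has no nsubj child, and its nmod:poss child "Haar" extends the span.
print([chunk.text for chunk in doc.noun_chunks])   # expected: ['Haar vriend']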
spacy/tests/conftest.py
@@ -202,6 +202,11 @@ def ne_tokenizer():
     return get_lang_class("ne")().tokenizer
 
 
+@pytest.fixture(scope="session")
+def nl_vocab():
+    return get_lang_class("nl")().vocab
+
+
 @pytest.fixture(scope="session")
 def nl_tokenizer():
     return get_lang_class("nl")().tokenizer
spacy/tests/lang/nl/test_noun_chunks.py (new file, +209 lines)
from spacy.tokens import Doc
import pytest


@pytest.fixture
def nl_sample(nl_vocab):
    # TEXT :
    # Haar vriend lacht luid. We kregen alweer ruzie toen we de supermarkt ingingen.
    # Aan het begin van de supermarkt is al het fruit en de groentes. Uiteindelijk hebben we dan ook
    # geen avondeten gekocht.
    words = [
        "Haar",
        "vriend",
        "lacht",
        "luid",
        ".",
        "We",
        "kregen",
        "alweer",
        "ruzie",
        "toen",
        "we",
        "de",
        "supermarkt",
        "ingingen",
        ".",
        "Aan",
        "het",
        "begin",
        "van",
        "de",
        "supermarkt",
        "is",
        "al",
        "het",
        "fruit",
        "en",
        "de",
        "groentes",
        ".",
        "Uiteindelijk",
        "hebben",
        "we",
        "dan",
        "ook",
        "geen",
        "avondeten",
        "gekocht",
        ".",
    ]
    heads = [
        1,
        2,
        2,
        2,
        2,
        6,
        6,
        6,
        6,
        13,
        13,
        12,
        13,
        6,
        6,
        17,
        17,
        24,
        20,
        20,
        17,
        24,
        24,
        24,
        24,
        27,
        27,
        24,
        24,
        36,
        36,
        36,
        36,
        36,
        35,
        36,
        36,
        36,
    ]
    deps = [
        "nmod:poss",
        "nsubj",
        "ROOT",
        "advmod",
        "punct",
        "nsubj",
        "ROOT",
        "advmod",
        "obj",
        "mark",
        "nsubj",
        "det",
        "obj",
        "advcl",
        "punct",
        "case",
        "det",
        "obl",
        "case",
        "det",
        "nmod",
        "cop",
        "advmod",
        "det",
        "ROOT",
        "cc",
        "det",
        "conj",
        "punct",
        "advmod",
        "aux",
        "nsubj",
        "advmod",
        "advmod",
        "det",
        "obj",
        "ROOT",
        "punct",
    ]
    pos = [
        "PRON",
        "NOUN",
        "VERB",
        "ADJ",
        "PUNCT",
        "PRON",
        "VERB",
        "ADV",
        "NOUN",
        "SCONJ",
        "PRON",
        "DET",
        "NOUN",
        "NOUN",
        "PUNCT",
        "ADP",
        "DET",
        "NOUN",
        "ADP",
        "DET",
        "NOUN",
        "AUX",
        "ADV",
        "DET",
        "NOUN",
        "CCONJ",
        "DET",
        "NOUN",
        "PUNCT",
        "ADJ",
        "AUX",
        "PRON",
        "ADV",
        "ADV",
        "DET",
        "NOUN",
        "VERB",
        "PUNCT",
    ]
    return Doc(nl_vocab, words=words, heads=heads, deps=deps, pos=pos)


@pytest.fixture
def nl_reference_chunking():
    # Using frog https://github.com/LanguageMachines/frog/ we obtain the following NOUN-PHRASES:
    return [
        "haar vriend",
        "we",
        "ruzie",
        "we",
        "de supermarkt",
        "het begin",
        "de supermarkt",
        "het fruit",
        "de groentes",
        "we",
        "geen avondeten",
    ]


def test_need_dep(nl_tokenizer):
    """
    Test that noun_chunks raises ValueError for 'nl' language if Doc is not parsed.
    """
    txt = "Haar vriend lacht luid."
    doc = nl_tokenizer(txt)

    with pytest.raises(ValueError):
        list(doc.noun_chunks)


def test_chunking(nl_sample, nl_reference_chunking):
    """
    Test the noun chunks of a sample text. Uses a sample.
    The sample text simulates a Doc object as would be produced by nl_core_news_md.
    """
    chunks = [s.text.lower() for s in nl_sample.noun_chunks]
    assert chunks == nl_reference_chunking
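To run just these new tests locally from a spaCy checkout, a standard pytest invocation should do (a hedged sketch, not part of the commit itself; shown via pytest.main so it stays in Python):

import pytest

# Equivalent to running `pytest spacy/tests/lang/nl/test_noun_chunks.py -v`
# from the repository root.
pytest.main(["spacy/tests/lang/nl/test_noun_chunks.py", "-v"])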