mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	Merge pull request #1913 from ohenrik/nb_syntax_iterator
Norwegian Language (nb) - Added french syntax iterator with explanation
This commit is contained in:
		
						commit
						0954e15dda
					
				|  | @ -13,6 +13,12 @@ from ...language import Language | ||||||
| from ...attrs import LANG, NORM | from ...attrs import LANG, NORM | ||||||
| from ...util import update_exc, add_lookups | from ...util import update_exc, add_lookups | ||||||
| 
 | 
 | ||||||
|  | # Borrowing the French syntax iterator because both languages use | ||||||
|  | # Universal Dependencies for tagging/parsing. | ||||||
|  | # Read here for more: | ||||||
|  | # https://github.com/explosion/spaCy/pull/1882#issuecomment-361409573 | ||||||
|  | from .syntax_iterators import SYNTAX_ITERATORS | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class NorwegianDefaults(Language.Defaults): | class NorwegianDefaults(Language.Defaults): | ||||||
|     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) |     lex_attr_getters = dict(Language.Defaults.lex_attr_getters) | ||||||
|  | @ -22,6 +28,7 @@ class NorwegianDefaults(Language.Defaults): | ||||||
|     stop_words = STOP_WORDS |     stop_words = STOP_WORDS | ||||||
|     tag_map = TAG_MAP |     tag_map = TAG_MAP | ||||||
|     lemma_lookup = LOOKUP |     lemma_lookup = LOOKUP | ||||||
|  |     syntax_iterators = SYNTAX_ITERATORS | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class Norwegian(Language): | class Norwegian(Language): | ||||||
|  |  | ||||||
							
								
								
									
										42
									
								
								spacy/lang/nb/syntax_iterators.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										42
									
								
								spacy/lang/nb/syntax_iterators.py
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,42 @@ | ||||||
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | from ...symbols import NOUN, PROPN, PRON | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.

    Iterates over the tokens of `obj` and yields one `(start, end, label)`
    triple per noun chunk. `start` is the index of the token's `left_edge`,
    `end` is one past the index of its `right_edge`, and `label` is the value
    returned by registering 'NP' in the vocab's string store.

    A token produces a chunk when its part of speech is NOUN, PROPN or PRON
    and either its own dependency label is one of the noun-phrase labels, or
    it is attached by 'conj' (possibly through a chain of preceding 'conj'
    heads) to a token whose label is. A token is skipped if it, or any token
    in its subtree, already belongs to a previously yielded chunk.
    """
    labels = ['nsubj', 'nsubj:pass', 'obj', 'iobj', 'ROOT', 'appos', 'nmod', 'nmod:poss']
    doc = obj.doc  # Ensure works on both Doc and Span.
    strings = doc.vocab.strings
    # Register the labels the same way as 'conj' and 'NP', and keep them in a
    # set so the per-token membership test does not scan a list.
    np_deps = {strings.add(label) for label in labels}
    conj = strings.add('conj')
    np_label = strings.add('NP')
    seen = set()
    for word in obj:
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.i in seen:
            continue
        if word.dep in np_deps:
            is_np = True
        elif word.dep == conj:
            # Walk up the coordination chain to the first conjunct, only
            # following heads that precede the current token.
            head = word.head
            while head.dep == conj and head.head.i < head.i:
                head = head.head
            # If the head is an NP, and we're coordinated to it, we're an NP
            is_np = head.dep in np_deps
        else:
            is_np = False
        if not is_np:
            continue
        # Skip chunks that would overlap one that was already yielded.
        if any(w.i in seen for w in word.subtree):
            continue
        start = word.left_edge.i
        end = word.right_edge.i + 1
        seen.update(range(start, end))
        yield start, end, np_label
|  | 
 | ||||||
|  | 
 | ||||||
# Maps each syntax iterator's name to the function that implements it.
SYNTAX_ITERATORS = {'noun_chunks': noun_chunks}
		Loading…
	
		Reference in New Issue
	
	Block a user