Add example file to show answer to Issue #252

commit 9b303e158e
parent 1b83cb9dfa

examples/get_parse_subregions.py (new file, 59 lines)
@@ -0,0 +1,59 @@
|  | """Issue #252 | ||||||
|  | 
 | ||||||
|  | Question: | ||||||
|  | 
 | ||||||
|  | In the documents and tutorials the main thing I haven't found is examples on how to break sentences down into small sub thoughts/chunks. The noun_chunks is handy, but having examples on using the token.head to find small (near-complete) sentence chunks would be neat. | ||||||
|  | 
 | ||||||
|  | Lets take the example sentence on https://api.spacy.io/displacy/index.html | ||||||
|  | 
 | ||||||
|  | displaCy uses CSS and JavaScript to show you how computers understand language | ||||||
|  | This sentence has two main parts (XCOMP & CCOMP) according to the breakdown: | ||||||
|  | 
 | ||||||
|  | [displaCy] uses CSS and Javascript [to + show] | ||||||
|  | & | ||||||
|  | show you how computers understand [language] | ||||||
|  | I'm assuming that we can use the token.head to build these groups. In one of your examples you had the following function. | ||||||
|  | 
 | ||||||
|  | def dependency_labels_to_root(token): | ||||||
|  |     '''Walk up the syntactic tree, collecting the arc labels.''' | ||||||
|  |     dep_labels = [] | ||||||
|  |     while token.head is not token: | ||||||
|  |         dep_labels.append(token.dep) | ||||||
|  |         token = token.head | ||||||
|  |     return dep_labels | ||||||
|  | """ | ||||||
from __future__ import print_function, unicode_literals

# Answer:
# The easiest way is to find the head of the subtree you want, and then use the
# `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree` is the
# one that does what you're asking for most directly:

from spacy.en import English
nlp = English()

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print(''.join(w.text_with_ws for w in word.subtree))
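
# A quick illustration of the other iterators mentioned above (a sketch added
# here, not part of the original answer): `.children` yields a word's direct
# dependents, while `.lefts` and `.rights` yield the dependents on each side
# of it. These are standard Token attributes in spaCy.

for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print('children:', [child.text for child in word.children])
        print('lefts:', [child.text for child in word.lefts])
        print('rights:', [child.text for child in word.rights])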

# It'd probably be better for `word.subtree` to return a `Span` object instead
# of a generator over the tokens. If you want the `Span` you can get it via the
# `.right_edge` and `.left_edge` properties. The `Span` object is nice because
# you can easily get a vector, merge it, etc.

doc = nlp(u'displaCy uses CSS and JavaScript to show you how computers understand language')
for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, '|', subtree_span.root.text)
        print(subtree_span.similarity(doc))
        print(subtree_span.similarity(subtree_span.root))
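
# A minimal sketch of the "get a vector" point above (added, not part of the
# original answer): a Span's vector is the average of its tokens' vectors, so
# the extracted chunks can be compared numerically. This assumes the loaded
# model ships with word vectors; `Span.vector` and `Span.vector_norm` are
# standard spaCy attributes.

for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
        print(subtree_span.text, subtree_span.vector_norm)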

# You might also want to select a head, and then select a start and end position
# by walking along its children. You could then take the `.left_edge` and
# `.right_edge` of those tokens, and use them to calculate a span, as in the
# sketch below.
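
# A sketch of that idea (added here as an illustration; `span_for_head` and
# the `keep_deps` labels are assumptions, not spaCy API): walk the head's
# children, keep the ones whose arc labels you care about, and stretch the
# span to cover their left and right edges.

def span_for_head(doc, head, keep_deps=('nsubj', 'aux', 'dobj', 'dative')):
    '''Build a Span around `head`, extended over selected children.'''
    start = head.i
    end = head.i
    for child in head.children:
        if child.dep_ in keep_deps:
            start = min(start, child.left_edge.i)
            end = max(end, child.right_edge.i)
    return doc[start : end + 1]

for word in doc:
    if word.dep_ in ('xcomp', 'ccomp'):
        print(span_for_head(doc, word).text)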