mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			74 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			74 lines
		
	
	
		
			2.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # coding: utf8
 | |
| from __future__ import unicode_literals
 | |
| 
 | |
| from .doc import Doc
 | |
| from ..symbols import HEAD, TAG, DEP, ENT_IOB, ENT_TYPE
 | |
| 
 | |
| 
 | |
def merge_ents(doc):
    """Helper: collapse every entity span of the doc into a single token.

    Each span in `doc.ents` is merged in turn, passing the tag of the span's
    root token, the span's text and the span's label to `merge`. The doc is
    modified in place.

    doc: The doc whose entities should be merged.
    RETURNS: The same doc object that was passed in.
    """
    for entity in doc.ents:
        root_tag = entity.root.tag_
        surface = entity.text
        label = entity.label_
        entity.merge(root_tag, surface, label)
    return doc
 | |
| 
 | |
| 
 | |
def format_POS(token, light, flat):
    """Helper: build the POS output dict for a single token.

    token: Object exposing `text`, `lemma_`, `ent_type_`, `tag_`, `pos_`
        and `dep_` attributes.
    light: If truthy, the "lemma" and "NE" entries are left out.
    flat: If truthy, the "arc" and "modifiers" entries are left out.
    RETURNS (dict): The token's entry, with an empty "modifiers" list unless
        `flat` removed it.
    """
    node = {
        "word": token.text,
        "lemma": token.lemma_,
        "NE": token.ent_type_,
        "POS_fine": token.tag_,
        "POS_coarse": token.pos_,
        "arc": token.dep_,
        "modifiers": [],
    }
    # Collect the keys each option strips, then remove them in one pass.
    unwanted = []
    if light:
        unwanted.extend(("lemma", "NE"))
    if flat:
        unwanted.extend(("arc", "modifiers"))
    for key in unwanted:
        del node[key]
    return node
 | |
| 
 | |
| 
 | |
def POS_tree(root, light=False, flat=False):
    """Helper: generate a POS tree for a root token. The doc must have
    `merge_ents(doc)` ran on it.

    root: The token to build the (sub)tree for; its `children` are visited
        recursively.
    light (bool): Forwarded to `format_POS` for the root and every
        descendant, so the whole tree omits "lemma" and "NE".
    flat (bool): Forwarded to `format_POS`. A flat entry has no "modifiers"
        list, so no children are attached and only the root's entry is
        returned.
    RETURNS (dict): The entry for `root`, with child subtrees nested under
        "modifiers" unless `flat` is set.
    """
    subtree = format_POS(root, light=light, flat=flat)
    if flat:
        # format_POS dropped the "modifiers" key; indexing it would raise
        # KeyError for any token that has children.
        return subtree
    # Propagate the options so descendants are formatted like the root.
    subtree["modifiers"].extend(
        POS_tree(child, light=light, flat=flat) for child in root.children
    )
    return subtree
 | |
| 
 | |
| 
 | |
def parse_tree(doc, light=False, flat=False):
    """Copy the doc, then construct a syntactic parse tree similar to the one
    used in displaCy. A POS tree is generated for every sentence of the copy,
    so the doc passed in is not modified by the entity merging.

    doc (Doc): The doc for parsing.
    light (bool): Passed through to `POS_tree`.
    flat (bool): Passed through to `POS_tree`.
    RETURNS (list): One parse tree (dict) per sentence.

    EXAMPLE:
        >>> doc = nlp('Bob brought Alice the pizza. Alice ate the pizza.')
        >>> trees = doc.print_tree()
        >>> trees[1]
        {'modifiers': [
            {'modifiers': [], 'NE': 'PERSON', 'word': 'Alice', 'arc': 'nsubj',
             'POS_coarse': 'PROPN', 'POS_fine': 'NNP', 'lemma': 'Alice'},
            {'modifiers': [
                {'modifiers': [], 'NE': '', 'word': 'the', 'arc': 'det',
                 'POS_coarse': 'DET', 'POS_fine': 'DT', 'lemma': 'the'}],
             'NE': '', 'word': 'pizza', 'arc': 'dobj', 'POS_coarse': 'NOUN',
             'POS_fine': 'NN', 'lemma': 'pizza'},
            {'modifiers': [], 'NE': '', 'word': '.', 'arc': 'punct',
             'POS_coarse': 'PUNCT', 'POS_fine': '.', 'lemma': '.'}],
            'NE': '', 'word': 'ate', 'arc': 'ROOT', 'POS_coarse': 'VERB',
            'POS_fine': 'VBD', 'lemma': 'eat'}
    """
    vocab = doc.vocab
    words = [token.text for token in doc]
    clone = Doc(vocab, words=words)
    # Carry the annotations the tree needs over to the copy.
    attrs = [HEAD, TAG, DEP, ENT_IOB, ENT_TYPE]
    clone.from_array(attrs, doc.to_array(attrs))
    # Entities must be single tokens before the trees are built.
    merge_ents(clone)
    trees = []
    for sent in clone.sents:
        trees.append(POS_tree(sent.root, light=light, flat=flat))
    return trees
 |