# coding: utf-8
# cython: profile=True
# cython: infer_types=True
"""Implements the projectivize/deprojectivize mechanism from Nivre & Nilsson
2005 for pseudo-projective parsing. The implementation uses the HEAD
decoration scheme.
"""
from __future__ import unicode_literals

from copy import copy

from ..tokens.doc cimport Doc, set_children_from_heads
from ..errors import Errors


DELIMITER = '||'


def ancestors(tokenid, heads):
    # Returns all words going from the word up the path to the root. The path
    # to root cannot be longer than the number of words in the sentence. This
    # function ends after at most len(heads) steps, because it would otherwise
    # loop indefinitely on cycles.
    head = tokenid
    cnt = 0
    while heads[head] != head and cnt < len(heads):
        head = heads[head]
        cnt += 1
        yield head
        if head is None:
            break
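# A minimal worked sketch (hypothetical head array, not from the original
# source): in heads = [1, 2, 2], token 2 attaches to itself and is the root.
#
#     >>> list(ancestors(0, [1, 2, 2]))
#     [1, 2]
#     >>> list(ancestors(2, [1, 2, 2]))
#     []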


def contains_cycle(heads):
    # in an acyclic tree, the path from each word following the head relation
    # upwards always ends at the root node
    for tokenid in range(len(heads)):
        seen = set([tokenid])
        for ancestor in ancestors(tokenid, heads):
            if ancestor in seen:
                return seen
            seen.add(ancestor)
    return None
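# A hedged example with invented head arrays: heads = [1, 0, 2] puts tokens
# 0 and 1 in a cycle, so the set of tokens seen on the offending path comes
# back; a proper tree gives None.
#
#     >>> contains_cycle([1, 0, 2]) == {0, 1}
#     True
#     >>> contains_cycle([1, 2, 2]) is None
#     True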


def is_nonproj_arc(tokenid, heads):
    # definition (e.g. Havelka 2007): an arc h -> d, h < d is non-projective
    # if there is a token k, h < k < d such that h is not
    # an ancestor of k. Same for h -> d, h > d
    head = heads[tokenid]
    if head == tokenid:  # root arcs cannot be non-projective
        return False
    elif head is None:  # unattached tokens cannot be non-projective
        return False

    start, end = (head+1, tokenid) if head < tokenid else (tokenid+1, head)
    for k in range(start, end):
        for ancestor in ancestors(k, heads):
            if ancestor is None:  # for unattached tokens/subtrees
                break
            elif ancestor == head:  # normal case: k dominated by h
                break
        else:  # head not in ancestors: d -> h is non-projective
            return True
    return False
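# A worked sketch (hypothetical tree): in heads = [2, 3, 2, 2], the arc
# 3 -> 1 spans token 2, which is the root and so not dominated by 3, making
# that arc non-projective; the arc 2 -> 0 is projective because token 1 has
# 2 among its ancestors.
#
#     >>> is_nonproj_arc(1, [2, 3, 2, 2])
#     True
#     >>> is_nonproj_arc(0, [2, 3, 2, 2])
#     False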


def is_nonproj_tree(heads):
    # a tree is non-projective if at least one arc is non-projective
    return any(is_nonproj_arc(word, heads) for word in range(len(heads)))
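# Continuing the hypothetical tree above: one non-projective arc is enough
# to make the whole tree non-projective.
#
#     >>> is_nonproj_tree([2, 3, 2, 2])
#     True
#     >>> is_nonproj_tree([1, 2, 2])
#     False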


def decompose(label):
    return label.partition(DELIMITER)[::2]


def is_decorated(label):
    return DELIMITER in label
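# A small sketch of the two label helpers, using invented dependency labels:
# decompose() splits on the first DELIMITER and drops it, while
# is_decorated() merely tests for its presence.
#
#     >>> decompose('advmod||dobj')
#     ('advmod', 'dobj')
#     >>> decompose('advmod')
#     ('advmod', '')
#     >>> is_decorated('advmod||dobj'), is_decorated('advmod')
#     (True, False)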

def count_decorated_labels(gold_tuples):
    freqs = {}
    for raw_text, sents in gold_tuples:
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
            proj_heads, deco_labels = projectivize(heads, labels)
            # set the label to ROOT for each root dependent
            deco_labels = ['ROOT' if head == i else deco_labels[i]
                           for i, head in enumerate(proj_heads)]
            # count label frequencies
            for label in deco_labels:
                if is_decorated(label):
                    freqs[label] = freqs.get(label, 0) + 1
    return freqs


def preprocess_training_data(gold_tuples, label_freq_cutoff=30):
    preprocessed = []
    freqs = {}
    for raw_text, sents in gold_tuples:
        prepro_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
            proj_heads, deco_labels = projectivize(heads, labels)
            # set the label to ROOT for each root dependent
            deco_labels = ['ROOT' if head == i else deco_labels[i]
                           for i, head in enumerate(proj_heads)]
            # count label frequencies
            if label_freq_cutoff > 0:
                for label in deco_labels:
                    if is_decorated(label):
                        freqs[label] = freqs.get(label, 0) + 1
            prepro_sents.append(
                ((ids, words, tags, proj_heads, deco_labels, iob), ctnts))
        preprocessed.append((raw_text, prepro_sents))
    if label_freq_cutoff > 0:
        return _filter_labels(preprocessed, label_freq_cutoff, freqs)
    return preprocessed
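# A hedged usage sketch. The gold_tuples shape assumed here is the one the
# unpacking above implies: (raw_text, sents) pairs whose sentences are
# ((ids, words, tags, heads, labels, iob), ctnts); all values are invented.
#
#     >>> sent = (([0, 1, 2, 3], ['w0', 'w1', 'w2', 'w3'], ['X'] * 4,
#     ...          [2, 3, 2, 2], ['det', 'advmod', 'ROOT', 'dobj'],
#     ...          ['O'] * 4), [])
#     >>> out = preprocess_training_data([('w0 w1 w2 w3', [sent])],
#     ...                                label_freq_cutoff=0)
#     >>> out[0][1][0][0][3]  # heads have been projectivized
#     [2, 2, 2, 2]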


def projectivize(heads, labels):
    # Use the algorithm by Nivre & Nilsson 2005. Assumes heads to be a proper
    # tree, i.e. connected and cycle-free. Returns a new pair (heads, labels)
    # which encodes a projective and decorated tree.
    proj_heads = copy(heads)
    smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    if smallest_np_arc is None:  # this sentence is already projective
        return proj_heads, copy(labels)
    while smallest_np_arc is not None:
        _lift(smallest_np_arc, proj_heads)
        smallest_np_arc = _get_smallest_nonproj_arc(proj_heads)
    deco_labels = _decorate(heads, proj_heads, labels)
    return proj_heads, deco_labels
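# A worked sketch on the hypothetical tree from above: the non-projective
# arc 3 -> 1 is lifted until the tree is projective, and the lifted token's
# label is decorated with the label of its original head.
#
#     >>> projectivize([2, 3, 2, 2], ['det', 'advmod', 'ROOT', 'dobj'])
#     ([2, 2, 2, 2], ['det', 'advmod||dobj', 'ROOT', 'dobj'])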


cpdef deprojectivize(Doc doc):
    # Reattach arcs with decorated labels (following the HEAD scheme). For
    # each decorated arc X||Y, search top-down, left-to-right, breadth-first
    # until hitting a Y, then make that token the new head.
    for i in range(doc.length):
        label = doc.vocab.strings[doc.c[i].dep]
        if DELIMITER in label:
            new_label, head_label = label.split(DELIMITER)
            new_head = _find_new_head(doc[i], head_label)
            doc.c[i].head = new_head.i - i
            doc.c[i].dep = doc.vocab.strings.add(new_label)
    set_children_from_heads(doc.c, doc.length)
    return doc
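# A hedged illustration (no runnable snippet, since this needs a parsed
# Doc): if token i carries the decorated label 'advmod||dobj', its label
# becomes 'advmod' and the search looks for a 'dobj' descendant of its
# current head to serve as the new attachment point.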


def _decorate(heads, proj_heads, labels):
    # uses decoration scheme HEAD from Nivre & Nilsson 2005
    if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
        raise ValueError(Errors.E082.format(n_heads=len(heads),
                                            n_proj_heads=len(proj_heads),
                                            n_labels=len(labels)))
    deco_labels = []
    for tokenid, head in enumerate(heads):
        if head != proj_heads[tokenid]:
            deco_labels.append(
                '%s%s%s' % (labels[tokenid], DELIMITER, labels[head]))
        else:
            deco_labels.append(labels[tokenid])
    return deco_labels
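# Continuing the sketch: only the lifted token (original head 3, projective
# head 2) receives a decorated label.
#
#     >>> _decorate([2, 3, 2, 2], [2, 2, 2, 2],
#     ...           ['det', 'advmod', 'ROOT', 'dobj'])
#     ['det', 'advmod||dobj', 'ROOT', 'dobj']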


def _get_smallest_nonproj_arc(heads):
    # return the smallest non-proj arc or None, where size is defined as the
    # distance between dep and head, and ties are broken left to right
    smallest_size = float('inf')
    smallest_np_arc = None
    for tokenid, head in enumerate(heads):
        size = abs(tokenid-head)
        if size < smallest_size and is_nonproj_arc(tokenid, heads):
            smallest_size = size
            smallest_np_arc = tokenid
    return smallest_np_arc
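# On the hypothetical tree above, token 1 carries the only non-projective
# arc, so it is trivially the smallest.
#
#     >>> _get_smallest_nonproj_arc([2, 3, 2, 2])
#     1
#     >>> _get_smallest_nonproj_arc([1, 2, 2]) is None
#     True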


def _lift(tokenid, heads):
    # reattaches a word to its grandfather
    head = heads[tokenid]
    ghead = heads[head]
    # attach to ghead if head isn't attached to root, else attach to root
    heads[tokenid] = ghead if head != ghead else tokenid
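# One lifting step on the hypothetical tree, done in place: token 1 moves
# from head 3 up to 3's own head, token 2.
#
#     >>> heads = [2, 3, 2, 2]
#     >>> _lift(1, heads)
#     >>> heads
#     [2, 2, 2, 2]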


def _find_new_head(token, headlabel):
    # search through the tree starting from the head of the given token
    # returns the first descendant with the given label
    # if there is none, return the current head (no change)
    queue = [token.head]
    while queue:
        next_queue = []
        for qtoken in queue:
            for child in qtoken.children:
                if child.is_space:
                    continue
                if child == token:
                    continue
                if child.dep_ == headlabel:
                    return child
                next_queue.append(child)
        queue = next_queue
    return token.head
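# A hedged illustration (needs a parsed Doc, so no runnable snippet): for a
# decorated arc whose stored head label is 'dobj', the search starts at the
# token's current head and walks that subtree breadth-first, returning the
# first non-space descendant whose dep_ is 'dobj'; failing that, the current
# head is kept.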


def _filter_labels(gold_tuples, cutoff, freqs):
    # throw away infrequent decorated labels, since we can't learn them
    # reliably anyway and it keeps the label set smaller
    filtered = []
    for raw_text, sents in gold_tuples:
        filtered_sents = []
        for (ids, words, tags, heads, labels, iob), ctnts in sents:
            filtered_labels = []
            for label in labels:
                if is_decorated(label) and freqs.get(label, 0) < cutoff:
                    filtered_labels.append(decompose(label)[0])
                else:
                    filtered_labels.append(label)
            filtered_sents.append(
                ((ids, words, tags, heads, filtered_labels, iob), ctnts))
        filtered.append((raw_text, filtered_sents))
    return filtered
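# A hedged sketch with invented values: a decorated label counted fewer
# times than the cutoff is stripped back to its plain part.
#
#     >>> sent = (([0, 1, 2, 3], ['w0', 'w1', 'w2', 'w3'], ['X'] * 4,
#     ...          [2, 2, 2, 2], ['det', 'advmod||dobj', 'ROOT', 'dobj'],
#     ...          ['O'] * 4), [])
#     >>> out = _filter_labels([('w0 w1 w2 w3', [sent])], 2,
#     ...                      {'advmod||dobj': 1})
#     >>> out[0][1][0][0][4]
#     ['det', 'advmod', 'ROOT', 'dobj']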