mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	* Rename Span.head to Span.root.
This commit is contained in:
		
							parent
							
								
									c0255ed7d8
								
							
						
					
					
						commit
						53d1f5b2eb
					
				|  | @ -5,6 +5,10 @@ from collections import defaultdict | ||||||
| cdef class Span: | cdef class Span: | ||||||
|     """A slice from a Doc object.""" |     """A slice from a Doc object.""" | ||||||
|     def __cinit__(self, Doc tokens, int start, int end, int label=0): |     def __cinit__(self, Doc tokens, int start, int end, int label=0): | ||||||
|  |         if start < 0: | ||||||
|  |             start = tokens.length - start | ||||||
|  |         if end < 0: | ||||||
|  |             end = tokens.length - end | ||||||
|         self._seq = tokens |         self._seq = tokens | ||||||
|         self.start = start |         self.start = start | ||||||
|         self.end = end |         self.end = end | ||||||
|  | @ -37,20 +41,48 @@ cdef class Span: | ||||||
|         for i in range(self.start, self.end): |         for i in range(self.start, self.end): | ||||||
|             yield self._seq[i] |             yield self._seq[i] | ||||||
| 
 | 
 | ||||||
|     property head: |     property root: | ||||||
|         """The highest Token in the dependency tree in the Span, or None if |         """The first ancestor of the first word of the span that has its head | ||||||
|         the Span is not internally connected (i.e. there are multiple heads). |         outside the span. | ||||||
|  |          | ||||||
|  |         For example: | ||||||
|  |          | ||||||
|  |         >>> toks = nlp(u'I like New York in Autumn.') | ||||||
|  | 
 | ||||||
|  |         Let's name the indices --- easier than writing "toks[4]" etc. | ||||||
|  | 
 | ||||||
|  |         >>> i, like, new, york, in_, autumn, dot = range(len(toks))  | ||||||
|  | 
 | ||||||
|  |         The head of 'new' is 'York', and the head of 'York' is 'like' | ||||||
|  | 
 | ||||||
|  |         >>> toks[new].head.orth_ | ||||||
|  |         'York' | ||||||
|  |         >>> toks[york].head.orth_ | ||||||
|  |         'like' | ||||||
|  | 
 | ||||||
|  |         Create a span for "New York". Its root is "York". | ||||||
|  | 
 | ||||||
|  |         >>> new_york = toks[new:york+1] | ||||||
|  |         >>> new_york.root.orth_ | ||||||
|  |         'York' | ||||||
|  | 
 | ||||||
|  |         When there are multiple words with external dependencies, we take the first: | ||||||
|  | 
 | ||||||
|  |         >>> toks[autumn].head.orth_, toks[dot].head.orth_ | ||||||
|  |         ('in', 'like') | ||||||
|  |         >>> autumn_dot = toks[autumn:] | ||||||
|  |         >>> autumn_dot.root.orth_ | ||||||
|  |         'Autumn' | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             heads = [] |             # This should probably be called 'head', and the other one called | ||||||
|             for token in self: |             # 'gov'. But we went with 'head' elsewhere, and now we're stuck =/ | ||||||
|                 head_i = token.head.i |             cdef const TokenC* start = &self._seq.data[self.start] | ||||||
|                 if token.head is token or head_i >= self.end or head_i < self.start: |             cdef const TokenC* end = &self._seq.data[self.end] | ||||||
|                     heads.append(token) |             head = start | ||||||
|             if len(heads) != 1: |             while start <= (head + head.head) < end and head.head != 0: | ||||||
|                 return None |                 head += head.head | ||||||
|             else: |             return self[head - self._seq.data] | ||||||
|                 return heads[0] |  | ||||||
| 
 | 
 | ||||||
|     property lefts: |     property lefts: | ||||||
|         """Tokens that are to the left of the Span, whose head is within the Span.""" |         """Tokens that are to the left of the Span, whose head is within the Span.""" | ||||||
|  | @ -68,6 +100,14 @@ cdef class Span: | ||||||
|                     if right.i >= self.end: |                     if right.i >= self.end: | ||||||
|                         yield right |                         yield right | ||||||
| 
 | 
 | ||||||
|  |     property subtree: | ||||||
|  |         """Yield the subtree of each word in self.lefts, then the words of | ||||||
|  |         the span itself, then the subtree of each word in self.rights. | ||||||
|  |         """ | ||||||
|  |         def __get__(self): | ||||||
|  |             for word in self.lefts: | ||||||
|  |                 yield from word.subtree | ||||||
|  |             yield from self | ||||||
|  |             for word in self.rights: | ||||||
|  |                 yield from word.subtree | ||||||
|  | 
 | ||||||
|     property orth_: |     property orth_: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             return ''.join([t.string for t in self]).strip() |             return ''.join([t.string for t in self]).strip() | ||||||
|  | @ -84,10 +124,3 @@ cdef class Span: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             return self._seq.vocab.strings[self.label] |             return self._seq.vocab.strings[self.label] | ||||||
| 
 | 
 | ||||||
|     property subtree: |  | ||||||
|         def __get__(self): |  | ||||||
|             for word in self.lefts: |  | ||||||
|                 yield from word.subtree |  | ||||||
|             yield from self |  | ||||||
|             for word in self.rights: |  | ||||||
|                 yield from word.subtree |  | ||||||
|  |  | ||||||
|  | @ -5,8 +5,8 @@ import pytest | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @pytest.fixture | @pytest.fixture | ||||||
| def doc(en_nlp): | def doc(EN): | ||||||
|     return en_nlp('This is a sentence. This is another sentence. And a third.') |     return EN('This is a sentence. This is another sentence. And a third.') | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def test_sent_spans(doc): | def test_sent_spans(doc): | ||||||
|  | @ -15,3 +15,11 @@ def test_sent_spans(doc): | ||||||
|     assert sents[0].end == 5 |     assert sents[0].end == 5 | ||||||
|     assert len(sents) == 3 |     assert len(sents) == 3 | ||||||
|     assert sum(len(sent) for sent in sents) == len(doc) |     assert sum(len(sent) for sent in sents) == len(doc) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def test_root(doc): | ||||||
|  |     np = doc[2:4] | ||||||
|  |     assert len(np) == 2 | ||||||
|  |     assert np.orth_ == 'a sentence' | ||||||
|  |     assert np.root.orth_ == 'sentence' | ||||||
|  |     assert nlp.root.head.orth_ == 'is' | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user