mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Add alignment_mode argument to Span.char_span() (#12145)
* Add alignment_mode argument to Span.char_span()
* Update website
* Update spacy/tokens/span.pyx
  Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
* Add test
Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
This commit is contained in:
		
							parent
							
								
									c68e6b8a96
								
							
						
					
					
						commit
						774c10fa39
					
				|  | @ -367,6 +367,14 @@ def test_spans_by_character(doc): | |||
|             span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk" | ||||
|         ) | ||||
| 
 | ||||
|     # Span.char_span + alignment mode "contract" | ||||
|     span2 = doc[0:2].char_span( | ||||
|         span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract" | ||||
|     ) | ||||
|     assert span1.start_char == span2.start_char | ||||
|     assert span1.end_char == span2.end_char | ||||
|     assert span2.label_ == "GPE" | ||||
| 
 | ||||
| 
 | ||||
| def test_span_to_array(doc): | ||||
|     span = doc[1:-2] | ||||
|  |  | |||
|  | @ -98,6 +98,7 @@ class Span: | |||
|         label: Union[int, str] = ..., | ||||
|         kb_id: Union[int, str] = ..., | ||||
|         vector: Optional[Floats1d] = ..., | ||||
|         alignment_mode: str = ..., | ||||
|     ) -> Span: ... | ||||
|     @property | ||||
|     def conjuncts(self) -> Tuple[Token]: ... | ||||
|  |  | |||
|  | @ -362,7 +362,7 @@ cdef class Span: | |||
|         result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm) | ||||
|         # ensure we get a scalar back (numpy does this automatically but cupy doesn't) | ||||
|         return result.item() | ||||
|      | ||||
| 
 | ||||
|     cpdef np.ndarray to_array(self, object py_attr_ids): | ||||
|         """Given a list of M attribute IDs, export the tokens to a numpy | ||||
|         `ndarray` of shape `(N, M)`, where `N` is the length of the document. | ||||
|  | @ -639,7 +639,7 @@ cdef class Span: | |||
|         else: | ||||
|             return self.doc[root] | ||||
| 
 | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0): | ||||
|     def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict"): | ||||
|         """Create a `Span` object from the slice `span.text[start : end]`. | ||||
| 
 | ||||
|         start (int): The index of the first character of the span. | ||||
|  | @ -649,11 +649,16 @@ cdef class Span: | |||
|         kb_id (uint64 or string):  An ID from a KB to capture the meaning of a named entity. | ||||
|         vector (ndarray[ndim=1, dtype='float32']): A meaning representation of | ||||
|             the span. | ||||
|         alignment_mode (str): How character indices are aligned to token | ||||
|             boundaries. Options: "strict" (character indices must be aligned | ||||
|             with token boundaries), "contract" (span of all tokens completely | ||||
|             within the character span), "expand" (span of all tokens at least | ||||
|             partially covered by the character span). Defaults to "strict". | ||||
|         RETURNS (Span): The newly constructed object. | ||||
|         """ | ||||
|         start_idx += self.c.start_char | ||||
|         end_idx += self.c.start_char | ||||
|         return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector) | ||||
|         return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode) | ||||
| 
 | ||||
|     @property | ||||
|     def conjuncts(self): | ||||
|  |  | |||
|  | @ -186,14 +186,15 @@ the character indices don't map to a valid span. | |||
| > assert span.text == "New York" | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Description                                                                               | | ||||
| | ----------- | ----------------------------------------------------------------------------------------- | | ||||
| | `start`     | The index of the first character of the span. ~~int~~                                     | | ||||
| | `end`       | The index of the last character after the span. ~~int~~                                   | | ||||
| | `label`     | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~               | | ||||
| | `kb_id`     | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ | | ||||
| | `vector`    | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~            | | ||||
| | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~                                | | ||||
| | Name                                            | Description                                                                                                                                                                                                                                                                  | | ||||
| | ----------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `start`                                         | The index of the first character of the span. ~~int~~                                                                                                                                                                                                                        | | ||||
| | `end`                                           | The index of the last character after the span. ~~int~~                                                                                                                                                                                                                      | | ||||
| | `label`                                         | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~                                                                                                                                                                                                  | | ||||
| | `kb_id`                                         | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~                                                                                                                                                                                    | | ||||
| | `vector`                                        | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~                                                                                                                                                                                               | | ||||
| | `alignment_mode` <Tag variant="new">3.5.1</Tag> | How character indices snap to token boundaries. Options: `"strict"` (no snapping), `"contract"` (span of all tokens completely within the character span), `"expand"` (span of all tokens at least partially covered by the character span). Defaults to `"strict"`. ~~str~~ | | ||||
| | **RETURNS**                                     | The newly constructed object or `None`. ~~Optional[Span]~~                                                                                                                                                                                                                   | | ||||
| 
 | ||||
| ## Span.similarity {id="similarity",tag="method",model="vectors"} | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user