From e227d24d4356ddd6ffc0b997a092c02225bbc3e3 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Tue, 10 Aug 2021 22:13:53 +0900 Subject: [PATCH] Allow passing in array vars for speedup (#8882) * Allow passing in array vars for speedup This fixes #8845. Not sure about the docstring changes here... * Update docs Types maybe need more detail? Maybe not? * Run prettier on docs * Update spacy/tokens/span.pyx Co-authored-by: Sofie Van Landeghem --- spacy/tokens/span.pyx | 10 +++++++--- website/docs/api/span.md | 14 ++++++++++---- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 48c6053c1..7087ba986 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -218,10 +218,12 @@ cdef class Span: return Underscore(Underscore.span_extensions, self, start=self.c.start_char, end=self.c.end_char) - def as_doc(self, *, bint copy_user_data=False): + def as_doc(self, *, bint copy_user_data=False, array_head=None, array=None): """Create a `Doc` object with a copy of the `Span`'s data. copy_user_data (bool): Whether or not to copy the original doc's user data. + array_head (tuple): `Doc` array attrs, can be passed in to speed up computation. + array (ndarray): `Doc` as array, can be passed in to speed up computation. RETURNS (Doc): The `Doc` copy of the span. DOCS: https://spacy.io/api/span#as_doc @@ -229,8 +231,10 @@ cdef class Span: words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) - array_head = self.doc._get_array_attrs() - array = self.doc.to_array(array_head) + if array_head is None: + array_head = self.doc._get_array_attrs() + if array is None: + array = self.doc.to_array(array_head) array = array[self.start : self.end] self._fix_dep_copy(array_head, array) # Fix initial IOB so the entities are valid for doc.ents below. 
diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 9212f957d..48e310979 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -303,6 +303,10 @@ not been implemented for the given language, a `NotImplementedError` is raised. Create a new `Doc` object corresponding to the `Span`, with a copy of the data. +When calling this on many spans from the same doc, passing in a precomputed +array representation of the doc using the `array_head` and `array` args can save +time. + > #### Example > > ```python @@ -312,10 +316,12 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data. > assert doc2.text == "New York" > ``` -| Name | Description | -| ---------------- | ------------------------------------------------------------- | -| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ | -| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ | +| Name | Description | +| ---------------- | -------------------------------------------------------------------------------------------------------------------- | +| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ | +| `array_head` | Precomputed array attributes (headers) of the original doc, as generated by `Doc._get_array_attrs()`. ~~Tuple~~ | +| `array` | Precomputed array version of the original doc as generated by [`Doc.to_array`](/api/doc#to_array). ~~numpy.ndarray~~ | +| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ | ## Span.root {#root tag="property" model="parser"}