From 2707d30ce087ddde711193192c33bd1416c625f0 Mon Sep 17 00:00:00 2001
From: richardpaulhudson
Date: Wed, 19 Oct 2022 23:20:11 +0200
Subject: [PATCH] Intermediate state

---
 spacy/ml/richfeatureextractor.py |  41 +++--
 spacy/tests/doc/test_doc_api.py  |  52 ++++--
 spacy/tests/test_util.py         | 115 ++----------
 spacy/tokens/doc.pxd             |  39 ++--
 spacy/tokens/doc.pyi             |   6 +-
 spacy/tokens/doc.pyx             | 303 ++++++++++++-------------------
 spacy/util.py                    |  98 ++++------
 7 files changed, 249 insertions(+), 405 deletions(-)

diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py
index e76a92c86..91c7b9580 100644
--- a/spacy/ml/richfeatureextractor.py
+++ b/spacy/ml/richfeatureextractor.py
@@ -1,4 +1,5 @@
 from typing import List, Optional, Callable, Tuple
+from ..util import get_byte_arrays_for_search_chars
 from thinc.types import Ints2d
 from thinc.api import Model, registry

@@ -16,22 +17,30 @@ def RichFeatureExtractor(
     suff_search_chars: Optional[str] = None,
     suff_search_lengths: Optional[List[int]] = None,
 ) -> Model[List[Doc], List[Ints2d]]:
+    if pref_search_chars is not None:
+        pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive)
+    else:
+        pref_search, pref_ref = bytes(), bytes()
+    if suff_search_chars is not None:
+        suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive)
+    else:
+        suff_search, suff_ref = bytes(), bytes()
     return Model(
         "extract_character_combination_hashes",
         forward,
         attrs={
             "case_sensitive": case_sensitive,
             "pref_lengths": pref_lengths if pref_lengths is not None else [],
-            "pref_search_chars": pref_search_chars
-            if pref_search_chars is not None
-            else "",
+            "suff_lengths": suff_lengths if suff_lengths is not None else [],
+            "pref_search": pref_search,
+            "pref_ref": pref_ref,
+            "pref_s_char_l": len(pref_search) // 4 if pref_search_chars is not None else 0,
             "pref_search_lengths": pref_search_lengths
             if pref_search_lengths is not None
             else [],
-            "suff_lengths": suff_lengths if suff_lengths is not None else [],
-            "suff_search_chars": suff_search_chars
-            if suff_search_chars is not None
-            else "",
+            "suff_search": suff_search,
+            "suff_ref": suff_ref,
+            "suff_s_char_l": len(suff_search) // 4 if suff_search_chars is not None else 0,
             "suff_search_lengths": suff_search_lengths
             if suff_search_lengths is not None
             else [],
@@ -45,10 +54,14 @@ def forward(
     ops = model.ops
     case_sensitive: bool = model.attrs["case_sensitive"]
     pref_lengths: List[int] = model.attrs["pref_lengths"]
-    pref_search_chars: str = model.attrs["pref_search_chars"]
-    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
     suff_lengths: List[int] = model.attrs["suff_lengths"]
-    suff_search_chars: str = model.attrs["suff_search_chars"]
+    pref_search: bytes = model.attrs["pref_search"]
+    pref_ref: bytes = model.attrs["pref_ref"]
+    pref_s_char_l: int = model.attrs["pref_s_char_l"]
+    pref_search_lengths: List[int] = model.attrs["pref_search_lengths"]
+    suff_search: bytes = model.attrs["suff_search"]
+    suff_ref: bytes = model.attrs["suff_ref"]
+    suff_s_char_l: int = model.attrs["suff_s_char_l"]
     suff_search_lengths: List[int] = model.attrs["suff_search_lengths"]
     features: List[Ints2d] = []
     for doc in docs:
@@ -56,9 +69,13 @@ def forward(
         hashes = doc.get_character_combination_hashes(
-            case_sensitive=case_sensitive,
+            cs=case_sensitive,
             pref_lengths=pref_lengths,
             suff_lengths=suff_lengths,
-            pref_search_chars=pref_search_chars,
+            pref_search=pref_search,
+            pref_ref=pref_ref,
+            pref_s_char_l=pref_s_char_l,
             pref_search_lengths=pref_search_lengths,
-            suff_search_chars=suff_search_chars,
+            suff_search=suff_search,
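
Both byte arrays stored in the attrs are plain UTF-32LE strings, so the character counts kept under "pref_s_char_l"/"suff_s_char_l" are simply byte lengths divided by four. A minimal sketch of that relationship (illustrative only, not part of the patch; it relies on the get_byte_arrays_for_search_chars behaviour pinned down by the tests below):

    from spacy.util import get_byte_arrays_for_search_chars

    # Both arrays are UTF-32LE, i.e. exactly four bytes per character.
    search, ref = get_byte_arrays_for_search_chars("rp", False)
    assert len(search) == len(ref)
    # Case-insensitive search for "rp" covers "P", "R", "p" and "r", so the
    # stored character count ("pref_s_char_l"/"suff_s_char_l") is 4.
    assert len(search) // 4 == 4
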
+            suff_ref=suff_ref,
+            suff_s_char_l=suff_s_char_l,
             suff_search_lengths=suff_search_lengths,
         )
         features.append(ops.asarray2i(hashes))
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 14923d83b..d87cbdf5b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -14,6 +14,7 @@ from spacy.lang.xx import MultiLanguage
 from spacy.language import Language
 from spacy.lexeme import Lexeme
 from spacy.tokens import Doc, Span, SpanGroup, Token
+from spacy.util import get_byte_arrays_for_search_chars
 from spacy.vocab import Vocab
 from .test_underscore import clean_underscore  # noqa: F401

@@ -994,7 +995,8 @@ def test_doc_spans_setdefault(en_tokenizer):

 def _get_unsigned_32_bit_hash(input: str) -> int:
-    working_hash = hash(input.encode("UTF-16")[2:])
+    input = input.replace(" ", "\x00")
+    working_hash = hash(input.encode("UTF-32LE"))
     if working_hash < 0:
         working_hash = working_hash + (2 << 31)
     return working_hash

@@ -1004,15 +1006,21 @@
 def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive):
     doc = en_tokenizer("spaCy✨ and Prodigy")
+    suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive)
     hashes = doc.get_character_combination_hashes(
-        case_sensitive=case_sensitive,
+        cs=case_sensitive,
         pref_lengths=[1, 4, 3],
         suff_lengths=[2, 3, 4, 5],
-        pref_search_chars="",
+        pref_search=bytes(),
+        pref_ref=bytes(),
+        pref_s_char_l=0,
         pref_search_lengths=[2],
-        suff_search_chars="xx✨rp",
-        suff_search_lengths=[2, 1],
+        suff_search=suff_search,
+        suff_ref=suff_ref,
+        suff_s_char_l=5 if case_sensitive else 9,
+        suff_search_lengths=[2, 1],
     )
+
     assert hashes[0][0] == _get_unsigned_32_bit_hash("s")
     assert hashes[0][1] == _get_unsigned_32_bit_hash(
         "spaC" if case_sensitive else "spac"
     )
@@ -1031,22 +1039,22 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive)
     assert hashes[0][8] == _get_unsigned_32_bit_hash("p ")
     assert hashes[0][9] == _get_unsigned_32_bit_hash("p")
     assert hashes[1][0] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][3] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][4] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][5] == _get_unsigned_32_bit_hash("✨")
-    assert hashes[1][6] == _get_unsigned_32_bit_hash("✨")
+    assert hashes[1][1] == _get_unsigned_32_bit_hash("✨   ")
+    assert hashes[1][2] == _get_unsigned_32_bit_hash("✨  ")
+    assert hashes[1][3] == _get_unsigned_32_bit_hash(" ✨")
+    assert hashes[1][4] == _get_unsigned_32_bit_hash("  ✨")
+    assert hashes[1][5] == _get_unsigned_32_bit_hash("   ✨")
+    assert hashes[1][6] == _get_unsigned_32_bit_hash("    ✨")
     assert hashes[1][7] == _get_unsigned_32_bit_hash("  ")
     assert hashes[1][8] == _get_unsigned_32_bit_hash("✨ ")
     assert hashes[1][9] == _get_unsigned_32_bit_hash("✨")
     assert hashes[2][0] == _get_unsigned_32_bit_hash("a")
-    assert hashes[2][1] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][1] == _get_unsigned_32_bit_hash("and ")
     assert hashes[2][2] == _get_unsigned_32_bit_hash("and")
     assert hashes[2][3] == _get_unsigned_32_bit_hash("nd")
     assert hashes[2][4] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][5] == _get_unsigned_32_bit_hash("and")
-    assert hashes[2][6] == _get_unsigned_32_bit_hash("and")
+    assert hashes[2][5] == _get_unsigned_32_bit_hash(" and")
+    assert hashes[2][6] == _get_unsigned_32_bit_hash("  and")
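
The spaces inside the expected strings above stand for zero characters: the affix and search buffers are fixed-width and zero-filled, and _get_unsigned_32_bit_hash maps " " to "\x00" before hashing. The padding convention can be modelled with two hypothetical helpers (not part of the patch):

    def padded_pref(token: str, length: int) -> str:
        # Left-aligned prefix in a zero-filled field, like the prefix half
        # of the affix buffer populated in doc.pyx below.
        return token[:length].ljust(length, "\x00")

    def padded_suff(token: str, length: int) -> str:
        # Right-aligned suffix, like the suffix half of the affix buffer.
        return token[-length:].rjust(length, "\x00")

    assert padded_pref("✨", 4) == "✨\x00\x00\x00"  # asserted above as "✨   "
    assert padded_suff("✨", 3) == "\x00\x00✨"       # asserted above as "  ✨"
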
and") assert hashes[2][7] == _get_unsigned_32_bit_hash(" ") assert hashes[2][8] == _get_unsigned_32_bit_hash(" ") assert hashes[2][9] == _get_unsigned_32_bit_hash(" ") @@ -1076,17 +1084,23 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer): doc = en_tokenizer("and𐌞") + suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True) hashes = doc.get_character_combination_hashes( - case_sensitive=True, + cs=True, pref_lengths=[], suff_lengths=[1, 2, 3], - pref_search_chars="", + pref_search=bytes(), + pref_ref=bytes(), + pref_s_char_l = 0, pref_search_lengths=[], - suff_search_chars="a", + suff_search=suff_search, + suff_ref=suff_ref, + suff_s_char_l=1, suff_search_lengths=[1], ) - assert hashes[0][1] == _get_unsigned_32_bit_hash("𐌞") - assert hashes[0][2] == _get_unsigned_32_bit_hash("d𐌞") + assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞") + assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞") + assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞") assert hashes[0][3] == _get_unsigned_32_bit_hash("a") diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 89e0ab1b7..33acbc3f2 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,138 +1,55 @@ -import sys import spacy -def test_get_byte_arrays_for_search_chars_width_1_not_case_sensitive(): - ( - w1_search, - w1_finding, - w2_search, - w2_finding, - w4_search, - w4_finding, - ) = spacy.util.get_byte_arrays_for_search_chars("bfEWfwe", False) - assert w1_search == b"BEFWbefw" - assert w2_search == b"B\x00E\x00F\x00W\x00b\x00e\x00f\x00w\x00" - assert ( - w4_search - == b"B\x00\x00\x00E\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00e\x00\x00\x00f\x00\x00\x00w\x00\x00\x00" - ) - assert w1_finding == w2_finding == w4_finding == w4_search.lower() - - -def test_get_byte_arrays_for_search_chars_width_1_case_sensitive(): - ( - w1_search, - w1_finding, - w2_search, - w2_finding, - w4_search, - w4_finding, - ) = spacy.util.get_byte_arrays_for_search_chars("bfewT", True) - assert w1_search == b"Tbefw" - assert w2_search == b"T\x00b\x00e\x00f\x00w\00" - assert w4_search == b"T\x00\00\00b\x00\00\00e\x00\00\00f\x00\00\00w\00\00\00" - assert w1_finding == w2_finding == w4_finding == w4_search - - def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive(): ( - w1_search, - w1_finding, - w2_search, - w2_finding, - w4_search, - w4_finding, + search, + ref, ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False) - assert w1_search == b"BFWbfw" assert ( - w1_finding - == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00" - ) - assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00" - assert ( - w2_finding - == w4_finding + ref == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" ) + assert ( - w4_search + search == b"B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" ) def test_get_byte_arrays_for_search_chars_width_2_case_sensitive(): ( - w1_search, - w1_finding, - w2_search, - w2_finding, - w4_search, - w4_finding, + search, + ref, ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True) - assert w1_search == b"bfw" - assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00" - assert w2_search == b"b\x00f\x00w\x00\xe9\x00" assert ( - w2_finding - == w4_finding - == w4_search - == 
b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00" + ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00" ) def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive(): ( - w1_search, - w1_finding, - w2_search, - w2_finding, - w4_search, - w4_finding, + search, + ref, ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False) - assert w1_search == b"BFWbfw" assert ( - w1_finding - == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00" - ) - - assert w2_search == b"B\x00F\x00W\x00b\x00f\x00w\x00\xc9\x00\xe9\x00" - - assert ( - w2_finding - == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" - ) - - assert ( - w4_search + search == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" ) assert ( - w4_finding + ref == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" ) def test_get_byte_arrays_for_search_chars_width_4_case_sensitive(): ( - w1_search, - w1_finding, - w2_search, - w2_finding, - w4_search, - w4_finding, + search, + ref, ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True) - assert w1_search == b"bfw" - assert w1_finding == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00" - assert w2_search == b"b\x00f\x00w\x00\xc9\x00\xe9\x00" + assert search == ref assert ( - w2_finding - == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" - ) - - assert w4_search == w4_finding - assert ( - w4_finding + ref == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" ) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index bf81b4d10..1d97d14c4 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -33,35 +33,26 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 cdef int [:,:] _get_lca_matrix(Doc, int start, int end) -cdef void _populate_affix_buf( - const void* str_data_ptr, - const unsigned int unicode_byte_width, - const int word_idx, - const int word_len, - Py_UCS4* affix_buf, +cdef void _populate_aff_buf( + const Py_UCS4* text_buf, + const int tok_idx, + const int tok_len, + Py_UCS4* aff_buf, const int pref_length, const int suff_length, const bint to_lower ) -cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes) - - -cdef bint _is_searched_char_in_search_chars_v( - const unsigned short searched_char, - const unsigned char[:] search_chars_v, - const unsigned int search_chars_v_len, -) - - -cdef void _set_found_char_buf( - const bint suffs_not_prefs, - const unsigned char[:] searched_string_v, - const unsigned int searched_string_len, - const unsigned char[:] search_chars_v, - const unsigned int search_chars_v_len, - char* found_char_buf, - const unsigned int found_char_buf_len, +cdef void _populate_search_buf( + const Py_UCS4* text_buf, + const int tok_idx, + const int tok_len, + Py_UCS4* search_buf, + Py_UCS4* ref_buf, + const int search_buf_len, + Py_UCS4* finding_buf, + const int finding_buf_len, + bint suffs_not_prefs ) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 6dcf87846..975367208 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -181,9 +181,13 @@ class Doc: pref_lengths: List[int], suff_lengths: List[int], pref_search_chars: str, + pref_ref_chars: str, + pref_search_char_length: int, 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 2d3ee5d17..c0d3890fd 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -3,7 +3,7 @@ from typing import Set, List

 cimport cython
 cimport numpy as np
-from libc.string cimport memcpy
+from libc.string cimport memcpy, memcmp, memset
 from libc.math cimport sqrt
 from libc.stdint cimport int32_t, uint64_t

@@ -42,12 +42,6 @@ from ..util import get_words_and_spaces

 DEF PADDING = 5

-cdef extern from *:
-    Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
-    void* PyUnicode_DATA(void* o)
-    int PyUnicode_KIND(void *data)
-    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
-
 cdef int bounds_check(int i, int length, int padding) except -1:
     if (i + padding) < 0:
         raise IndexError(Errors.E026.format(i=i, length=length))

@@ -111,6 +105,16 @@ class SetEntsDefault(str, Enum):
         return list(cls.__members__.keys())


+cdef extern from "unicodeobject.h":
+    Py_UCS4 PyUnicode_READ(int kind, void *data, int index)
+    void* PyUnicode_DATA(void* o)
+    void PyUnicode_READY(void* o)
+    int PyUnicode_KIND(void *data)
+    int PyUnicode_IS_COMPACT(void *data)
+
+    Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch)
+
+
 cdef class Doc:
     """A sequence of Token objects. Access sentences and named entities, export
     annotations to numpy arrays, losslessly serialize to compressed binary

@@ -1742,33 +1746,37 @@ cdef class Doc:

     def get_character_combination_hashes(
-        self,
-        bint case_sensitive,
+        self,
+        *,
+        bint cs,
         pref_lengths: List[int],
         suff_lengths: List[int],
-        str pref_search_chars,
+        char* pref_search,
+        char* pref_ref,
+        int pref_s_char_l,
         pref_search_lengths: List[int],
-        str suff_search_chars,
-        suff_search_lengths: List[int]
+        char* suff_search,
+        char* suff_ref,
+        int suff_s_char_l,
+        suff_search_lengths: List[int],
     ):
         """
         Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character
         combinations derived from the string (text/orth) of each token.

-        case_sensitive: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
-            if *case_sensitive==False*, upper-case characters in *search_chars* will not be found in token strings.
+        cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that
+            if *cs==False*, upper-case characters in the search byte arrays will not be found in token strings.
         pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*,
             the prefixes hashed for "spaCy" would be "sp" and "spa".
-        suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]*
-            and *case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
-        pref_search_chars: a string containing characters to search for within each token, starting at the beginning.
+        suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]*
+            and *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy".
+        pref_search: a UTF-32LE byte array containing characters to search for within each token, starting at the beginning.
+        pref_ref: a UTF-32LE byte array of the same length as *pref_search*; when a search character is matched, the
+            character at the corresponding position in *pref_ref* is emitted into the search result.
+        pref_s_char_l: the number of characters in *pref_search*.
         pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-            *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
+            *pref_search_lengths==[1, 2]*, the prefix search characters are "aC" and *cs==False*, the searched strings hashed for
             "spaCy" would be "a" and "ac".
-        suff_search_chars: a string containing characters to search for within each token, starting at the end.
+        suff_search: a UTF-32LE byte array containing characters to search for within each token, starting at the end.
+        suff_ref: a UTF-32LE byte array of the same length as *suff_search*; when a search character is matched, the
+            character at the corresponding position in *suff_ref* is emitted into the search result.
+        suff_s_char_l: the number of characters in *suff_search*.
         suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if
-            *suff_search_lengths==[1, 2]*, *suff_search_chars=="aC"* and *case_sensitive==False*, the searched strings hashed for
+            *suff_search_lengths==[1, 2]*, the suffix search characters are "aC" and *cs==False*, the searched strings hashed for
             "spaCy" would be "c" and "ca".

+        For a document with tokens ["spaCy", "and", "Prodigy"], a call with *cs==True*, *pref_lengths==[2]*,
+        *suff_lengths==[2, 4, 6]*, suffix search characters "xy" and *suff_search_lengths==[1, 2]* would return an
+        array with the shape (3, 6) corresponding to:
+
+        [[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("y ")],
+        [hash("an"), hash("nd"), hash(" and"), hash("   and"), hash(" "), hash("  ")],
+        [hash("Pr"), hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]]
         """
-        cdef int longest_pref = max(pref_lengths) if len(pref_lengths) > 0 else 0
-        cdef int longest_suff = max(suff_lengths) if len(suff_lengths) > 0 else 0
-        cdef Py_UCS4* affix_buf = <Py_UCS4*>self.mem.alloc(4, longest_pref + longest_suff)
-
-        cdef void* text_ptr = <void*>self.text
-        cdef void* text_data_ptr = PyUnicode_DATA(text_ptr)  # todo change to const void
-        cdef unsigned int unicode_byte_width = PyUnicode_KIND(text_ptr), num_toks = len(self), tok_idx, token_idx, token_len
-
-        cdef TokenC token_c
-        cdef str working_str
-
-        for tok_idx in range(num_toks):
-            token_c = self.c[tok_idx]
-            token_idx = token_c.idx
-            token_len = token_c.lex.length
-            _populate_affix_buf(
-                text_data_ptr,
-                unicode_byte_width,
-                token_idx,
-                token_len,
-                affix_buf,
-                longest_pref,
-                longest_suff,
-                not case_sensitive
-            )
-
-        cdef const unsigned char[:] pref_search_chars_v = _get_utf16_memoryview(pref_search_chars, True)
-        cdef const unsigned char[:] suff_search_chars_v = _get_utf16_memoryview(suff_search_chars, True)
-        cdef unsigned int longest_search_length = max(pref_search_lengths + suff_search_lengths) if len(pref_search_lengths + suff_search_lengths) > 0 else 0
-        cdef bytes found_char_buf_bytes = (bytes(" " * longest_search_length, "UTF-16"))[2:]  # first two bytes express endianness
-        cdef char* found_char_buf = found_char_buf_bytes
-        cdef unsigned int pref_search_chars_v_len = len(pref_search_chars_v), suff_search_chars_v_len = len(suff_search_chars_v)
-        cdef unsigned int found_char_buf_len = len(found_char_buf_bytes)
-
-        cdef unsigned int num_pref_norm_hashes = len(pref_lengths), num_suff_norm_hashes = len(suff_lengths)
-        cdef unsigned int num_pref_search_hashes = len(pref_search_lengths)
-        cdef unsigned int num_suff_search_hashes = len(suff_search_lengths)
-        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_norm_hashes), dtype="int64")
-
-        cdef const unsigned char[:] tok_str_v
-        cdef unsigned int tok_str_v_len, hash_idx, affix_start, char_comb_len
-        cdef attr_t num_tok_attr
-        cdef str str_tok_attr
+        cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0
+        cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0
+        cdef int aff_buf_l = max_pref_l + max_suff_l
+        cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0
+        cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0
+
+        cdef Py_UCS4* aff_buf = <Py_UCS4*>self.mem.alloc(4, aff_buf_l)
+        cdef Py_UCS4* pref_s_buf = <Py_UCS4*>pref_search
+        cdef Py_UCS4* pref_r_buf = <Py_UCS4*>pref_ref
+        cdef Py_UCS4* pref_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_pref_l)
+        cdef Py_UCS4* suff_s_buf = <Py_UCS4*>suff_search
+        cdef Py_UCS4* suff_r_buf = <Py_UCS4*>suff_ref
+        cdef Py_UCS4* suff_f_buf = <Py_UCS4*>self.mem.alloc(4, max_s_suff_l)
+
+        cdef bytes encoded_text = self.text.encode("utf-32le")
+        cdef char* intermediate_text = encoded_text
+        cdef Py_UCS4* text_buf = <Py_UCS4*>intermediate_text
+
+        cdef unsigned int num_toks = len(self), aff_len, tok_i, tok_idx, tok_len, hash_idx
+        cdef unsigned int h_pref_n = len(pref_lengths)
+        cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths)
+        cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n
+        cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n
+        cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64")
+
+        cdef TokenC tok_c

-        for tok_idx in range(num_toks):
-            num_tok_attr = self.c[tok_idx].lex.orth if case_sensitive else self.c[tok_idx].lex.lower
-            str_tok_attr = self.vocab.strings[num_tok_attr]
-            tok_str_v = _get_utf16_memoryview(str_tok_attr, False)
-            tok_str_v_len = len(tok_str_v)
+        for tok_i in range(num_toks):
+            tok_c = self.c[tok_i]
+            tok_idx = tok_c.idx
+            tok_len = tok_c.lex.length

-            for hash_idx in range(num_pref_norm_hashes):
-                char_comb_len = pref_lengths[hash_idx] * 2
-                if char_comb_len > tok_str_v_len:
-                    char_comb_len = tok_str_v_len
-                hashes[tok_idx, hash_idx] = hash32(<void*>&tok_str_v[0], char_comb_len, 0)
+            _populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs)
+            _populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False)
+            _populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True)

-            for hash_idx in range(num_pref_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes):
-                char_comb_len = suff_lengths[hash_idx - num_pref_norm_hashes] * 2
-                if char_comb_len > tok_str_v_len:
-                    char_comb_len = tok_str_v_len
-                affix_start = tok_str_v_len - char_comb_len
-                hashes[tok_idx, hash_idx] = hash32(<void*>&tok_str_v[affix_start], char_comb_len, 0)
+            for hash_idx in range(h_pref_n):
+                aff_len = pref_lengths[hash_idx]
+                hashes[tok_i, hash_idx] = hash32(<void*>aff_buf, aff_len * 4, 0)

-            _set_found_char_buf(
-                False,
-                tok_str_v,
-                tok_str_v_len,
-                pref_search_chars_v,
-                pref_search_chars_v_len,
-                found_char_buf,
-                found_char_buf_len,
-            )
+            for hash_idx in range(h_pref_n, h_suff_end_idx):
+                aff_len = suff_lengths[hash_idx - h_pref_n]
+                hashes[tok_i, hash_idx] = hash32(<void*>(aff_buf + aff_buf_l - aff_len), aff_len * 4, 0)

-            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes):
-                char_comb_len = pref_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes)] * 2
-                hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
+            for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx):
+                aff_len = pref_search_lengths[hash_idx - h_suff_end_idx]
+                hashes[tok_i, hash_idx] = hash32(<void*>pref_f_buf, aff_len * 4, 0)

-            _set_found_char_buf(
-                True,
-                tok_str_v,
-                tok_str_v_len,
-                suff_search_chars_v,
-                suff_search_chars_v_len,
-                found_char_buf,
-                found_char_buf_len,
-            )
-
-            for hash_idx in range(num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes, num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes + num_suff_search_hashes):
-                char_comb_len = suff_search_lengths[hash_idx - (num_pref_norm_hashes + num_suff_norm_hashes + num_pref_search_hashes)] * 2
-                hashes[tok_idx, hash_idx] = hash32(found_char_buf, char_comb_len, 0)
+            for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx):
+                aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx]
+                hashes[tok_i, hash_idx] = hash32(<void*>suff_f_buf, aff_len * 4, 0)

+        self.mem.free(aff_buf)
+        self.mem.free(pref_f_buf)
+        self.mem.free(suff_f_buf)
         return hashes
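
The four boundary indices computed above fully determine the column layout of the returned array. A hypothetical pure-Python helper reproducing the arithmetic:

    def column_ranges(pref_lengths, suff_lengths, pref_search_lengths, suff_search_lengths):
        h_pref_n = len(pref_lengths)
        h_suff_end_idx = h_pref_n + len(suff_lengths)
        h_pref_s_end_idx = h_suff_end_idx + len(pref_search_lengths)
        h_suff_s_end_idx = h_pref_s_end_idx + len(suff_search_lengths)
        return {
            "pref": range(0, h_pref_n),
            "suff": range(h_pref_n, h_suff_end_idx),
            "pref_search": range(h_suff_end_idx, h_pref_s_end_idx),
            "suff_search": range(h_pref_s_end_idx, h_suff_s_end_idx),
        }

    # With the good-case test configuration above (pref [1, 4, 3],
    # suff [2, 3, 4, 5], pref_search [2], suff_search [2, 1]) this yields
    # columns 0-2, 3-6, 7 and 8-9, matching the asserted indices.
    assert column_ranges([1, 4, 3], [2, 3, 4, 5], [2], [2, 1])["suff_search"] == range(8, 10)
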
     @staticmethod
@@ -2055,12 +2025,11 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end):
     return lca_matrix


-cdef void _populate_affix_buf(
-    const void* str_data_ptr,
-    const unsigned int unicode_byte_width,
-    const int word_idx,
-    const int word_len,
-    Py_UCS4* affix_buf,
+cdef void _populate_aff_buf(
+    const Py_UCS4* text_buf,
+    const int tok_idx,
+    const int tok_len,
+    Py_UCS4* aff_buf,
     const int pref_length,
     const int suff_length,
     const bint to_lower
@@ -2070,107 +2039,75 @@ cdef void _populate_affix_buf(
 ):
     """
-    str_data_ptr: a pointer to the raw data in the containing string, which must be in
-        canonical Unicode form (see PEP 393).
-    unicode_byte_width: the number of bytes occupied by each character in the containing string.
-    word_idx: the index of the first character of the word within the containing string.
-    word_len: the length of the word.
-    affix_buf: the buffer to populate.
+    text_buf: a pointer to the UTF-32LE representation of the containing string, i.e. four bytes per character.
+    tok_idx: the index of the first character of the token within the containing string.
+    tok_len: the length of the token.
+    aff_buf: the buffer to populate.
     pref_length: the length of the prefix.
     suff_length: the length of the suffix.
     to_lower: if *True*, any upper case characters in either affix are converted to lower case.
     """
-    cdef int affix_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx
-    cdef Py_UCS4 working_wchar
+    cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx, zero_fill_end

-    while affix_buf_idx < pref_length and affix_buf_idx < word_len:
-        working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, in_word_idx)
-        if to_lower:
-            working_wchar = Py_UNICODE_TOLOWER(working_wchar)
-        memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
-        affix_buf_idx += 1
+    while aff_buf_idx < pref_length and aff_buf_idx < tok_len:
+        memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4)
+        if to_lower:
+            aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
+        aff_buf_idx += 1

-    while (affix_buf_idx < buf_size - suff_length) or (affix_buf_idx < buf_size - word_len):
-        # fill out the empty middle part of the buffer with zeros
-        affix_buf[affix_buf_idx] = 0
-        affix_buf_idx += 1
+    zero_fill_end = buf_size - suff_length if suff_length < tok_len else buf_size - tok_len
+    if aff_buf_idx < zero_fill_end:
+        # fill out the empty middle part of the buffer with zeros
+        memset(aff_buf + aff_buf_idx, 0, (zero_fill_end - aff_buf_idx) * sizeof(Py_UCS4))
+        aff_buf_idx = zero_fill_end

-    while affix_buf_idx < buf_size:
-        in_word_idx = affix_buf_idx + word_len - buf_size
+    while aff_buf_idx < buf_size:
+        in_word_idx = aff_buf_idx + tok_len - buf_size
         # for suffixes we have to track the in-word index separately from the in-buffer index
         if in_word_idx < pref_length:
             # we've already retrieved this character as part of the prefix, so copy it from there
             # as that's quicker than retrieving it from the input string a second time
-            memcpy(affix_buf + affix_buf_idx, affix_buf + in_word_idx, 4)
+            memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4)
         else:
-            working_wchar = PyUnicode_READ(unicode_byte_width, str_data_ptr, word_idx + in_word_idx)
+            memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4)
             if to_lower:
-                working_wchar = Py_UNICODE_TOLOWER(working_wchar)
-            memcpy(affix_buf + affix_buf_idx, &working_wchar, 4)
-        affix_buf_idx += 1
+                aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx])
+        aff_buf_idx += 1
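
The buffer layout produced by _populate_aff_buf can be modelled in pure Python (a minimal sketch, not part of the patch; affix_buffer is hypothetical and writes "\x00" where the Cython code writes zero code points):

    def affix_buffer(token: str, pref_length: int, suff_length: int) -> str:
        # Prefix region, left-aligned; suffix region, right-aligned; any
        # unused middle positions stay zero-filled.
        pref = token[:pref_length].ljust(pref_length, "\x00")
        suff = "" if suff_length == 0 else token[-suff_length:]
        return pref + suff.rjust(suff_length, "\x00")

    # A token longer than both affixes fills the buffer completely:
    assert affix_buffer("spaCy", 4, 5) == "spaC" + "spaCy"
    # A short token leaves the middle of the buffer zero-filled:
    assert affix_buffer("✨", 4, 5) == "✨\x00\x00\x00" + "\x00\x00\x00\x00✨"
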
-cdef const unsigned char[:] _get_utf16_memoryview(str unicode_string, const bint check_2_bytes):
-    """
-    Return a memory view of the UTF-16 representation of a string with the default endianness of the platform.
-
-    Throw a ValueError if *check_2_bytes == True* and one or more characters in the UTF-16 representation
-    occupies four bytes rather than two.
-    """
-    cdef const unsigned char[:] view = unicode_string.encode("UTF-16")
-    view = view[2:]  # first two bytes express endianness
-    cdef unsigned int unicode_len, view_len
-    if check_2_bytes:
-        unicode_len = len(unicode_string)
-        view_len = len(view)
-        if unicode_len * 2 != view_len:
-            raise ValueError(Errors.E1046)
-    return view
-
-
-cdef bint _is_searched_char_in_search_chars_v(
-    const unsigned short searched_char,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-):
-    cdef unsigned int search_chars_v_idx = 0
-    while search_chars_v_idx < search_chars_v_len:
-        if searched_char == (<unsigned short*>&search_chars_v[search_chars_v_idx])[0]:
-            return True
-        search_chars_v_idx += 2
-    return False
-
-
-cdef void _set_found_char_buf(
-    const bint suffs_not_prefs,
-    const unsigned char[:] searched_string_v,
-    const unsigned int searched_string_v_len,
-    const unsigned char[:] search_chars_v,
-    const unsigned int search_chars_v_len,
-    char* found_char_buf,
-    const unsigned int found_char_buf_len,
-):
-    """ Pick the UTF-16 characters from *searched_string_v* that are also in *search_chars_v* and writes them in order to *found_char_buf*.
-        If *suffs_not_prefs*, the search starts from the end of *searched_string_v* rather than from the beginning.
-    """
- """ - cdef unsigned int found_char_buf_idx = 0, searched_string_idx = searched_string_v_len - 2 if suffs_not_prefs else 0 - cdef unsigned short searched_char, SPACE = 32 - - while found_char_buf_idx < found_char_buf_len: - searched_char = ( &searched_string_v[searched_string_idx])[0] - if _is_searched_char_in_search_chars_v(searched_char, search_chars_v, search_chars_v_len): - memcpy(found_char_buf + found_char_buf_idx, &searched_char, 2) - found_char_buf_idx += 2 - if suffs_not_prefs: - if searched_string_idx <= 0: + while finding_buf_idx < finding_buf_len: + for search_buf_idx in range (search_buf_len): + cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4) + if cmp_res == 0: + memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4) + finding_buf_idx += 1 + if cmp_res >= 0: break - searched_string_idx -= 2 + if suffs_not_prefs: + if text_string_idx <= tok_idx: + break + text_string_idx -= 1 else: - searched_string_idx += 2 - if searched_string_idx >= searched_string_v_len: + text_string_idx += 1 + if text_string_idx >= tok_idx + tok_len: break - while found_char_buf_idx < found_char_buf_len: - memcpy(found_char_buf + found_char_buf_idx, &SPACE, 2) - found_char_buf_idx += 2 + if finding_buf_idx < finding_buf_len: + memset(finding_buf + finding_buf_idx, 0, finding_buf_len - finding_buf_idx) + def pickle_doc(doc): diff --git a/spacy/util.py b/spacy/util.py index 7a9a5e7ee..89e51118e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1739,104 +1739,68 @@ def all_equal(iterable): def get_byte_arrays_for_search_chars( search_chars: str, case_sensitive: bool -) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]: +) -> Tuple[bytes, bytes]: """ - The text of a spaCy document is stored as a Python-internal Unicode representation - as defined by PEP 393. Each character in such a representation has the width of the - longest character in the string, which is either 1, 2 or 4 bytes. - This function supports the rich feature extractor. It returns search byte arrays with - 1-, 2- and 4-byte character widths that are used for comparison with each of the three - representation types when searching document texts for search characters. Each byte array - contains characters that are as wide or narrower than its own width; a byte array can - ignore characters that are wider than its own width because a spaCy document with the - corresponding representation width could never contain characters wider than that width. + 4-byte character width that are used for comparison when searching document texts + for search characters. The encoding is little-endian regardless of architecture, as + this is what is expected by the murmurhash library used downstream. - When characters corresponding to search characters are found within a spaCy token - string, they are concatenated together and the resulting "finding byte arrays" are hashed. - It is crucial that the characters in all finding byte arrays representing a given sequence of - characters share the same width so that they all yield the same hash values. While it - would be possible to use the narrowest possible width for the sequence like PEP 393 does, - determining this would entain unnecessary processing. Instead, finding byte arrays always use - a 4-byte width. Each of the three search byte array therefore has a corresponding finding - byte array that is used to build up the finding byte arrays for specific document token strings. 
diff --git a/spacy/util.py b/spacy/util.py
index 7a9a5e7ee..89e51118e 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1739,104 +1739,68 @@ def all_equal(iterable):

 def get_byte_arrays_for_search_chars(
     search_chars: str, case_sensitive: bool
-) -> Tuple[bytes, bytes, bytes, bytes, bytes, bytes]:
+) -> Tuple[bytes, bytes]:
     """
-    The text of a spaCy document is stored as a Python-internal Unicode representation
-    as defined by PEP 393. Each character in such a representation has the width of the
-    longest character in the string, which is either 1, 2 or 4 bytes.
-    This function supports the rich feature extractor. It returns search byte arrays with
-    1-, 2- and 4-byte character widths that are used for comparison with each of the three
-    representation types when searching document texts for search characters. Each byte array
-    contains characters that are as wide or narrower than its own width; a byte array can
-    ignore characters that are wider than its own width because a spaCy document with the
-    corresponding representation width could never contain characters wider than that width.
+    This function supports the rich feature extractor. It returns search byte arrays with a
+    4-byte character width that are used for comparison when searching document texts
+    for search characters. The encoding is little-endian regardless of architecture, as
+    this is what is expected by the murmurhash library used downstream.

-    When characters corresponding to search characters are found within a spaCy token
-    string, they are concatenated together and the resulting "finding byte arrays" are hashed.
-    It is crucial that the characters in all finding byte arrays representing a given sequence of
-    characters share the same width so that they all yield the same hash values. While it
-    would be possible to use the narrowest possible width for the sequence like PEP 393 does,
-    determining this would entain unnecessary processing. Instead, finding byte arrays always use
-    a 4-byte width. Each of the three search byte arrays therefore has a corresponding finding
-    byte array that is used to build up the finding byte arrays for specific document token strings.
-
-    If *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
+    Alongside the "search byte array" against which words from document texts are compared
+    is the "ref byte array". When a character from the search byte array is matched,
+    the character at the corresponding position in the ref byte array is added to the
+    byte sequence of the configured length that is then hashed. This enables case-sensitivity
+    to be handled without converting the case of the words being searched: if
+    *case_sensitive==False*, the lower- or uppercase counterparts of any characters that
     have case are added to the search byte arrays, and both the original character and its
-    other-cased counterpart map to the lower-case version in the finding byte array.
-
-    All encodings are little-endian regardless of architecture, as this is what is expected by the
-    murmurhash library used downstream.
+    other-cased counterpart map to the lower-case version in the ref byte array.
     """

-    def encode(ch: str, width: int) -> bytes:
+    def encode(ch: str) -> bytes:
         """
         ch: a single character
-        int: the width of the character encoding to use
         """
-        if width == 4:
-            return ch.encode("UTF-32LE")
-        elif width == 2:
-            return ch.encode("UTF-16LE")
-        else:
-            return ch.encode("UTF-8")
+        return ch.encode("UTF-32LE")

     def add_to_byte_arrays(
-        search: List[bytes], finding: List[bytes], ch: str, width: int
+        search: List[bytes], ref: List[bytes], ch: str
     ) -> None:
-        """Add the byte representations of *ch* with representation of width
-        *width* to the two byte array lists.
+        """Add the byte representations of *ch* to the two byte array lists.
         """
-        this_char_bytes = encode(ch, width)
-        this_char_bytes_f = encode(ch, 4)
+        this_char_bytes = encode(ch)
         if not case_sensitive and ch.islower():
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                finding.append(this_char_bytes_f)
-            upper_char_bytes = encode(ch.upper(), width)
+                ref.append(this_char_bytes)
+            upper_char_bytes = encode(ch.upper())
             if upper_char_bytes not in search:
                 search.append(upper_char_bytes)
-                finding.append(this_char_bytes_f)
+                ref.append(this_char_bytes)
         elif not case_sensitive and ch.isupper():
-            lower_char_bytes = encode(ch.lower(), width)
-            lower_char_bytes_f = encode(ch.lower(), 4)
+            lower_char_bytes = encode(ch.lower())
             if this_char_bytes not in search:
                 search.append(this_char_bytes)
-                finding.append(lower_char_bytes_f)
+                ref.append(lower_char_bytes)
             if lower_char_bytes not in search:
                 search.append(lower_char_bytes)
-                finding.append(lower_char_bytes_f)
+                ref.append(lower_char_bytes)
         elif this_char_bytes not in search:
             search.append(this_char_bytes)
-            finding.append(this_char_bytes_f)
+            ref.append(this_char_bytes)
""" num_search = [list(entry) for entry in search] search = [entry for _, entry in sorted(zip(num_search, search))] - finding = [entry for _, entry in sorted(zip(num_search, finding))] - return b"".join(search), b"".join(finding) + ref = [entry for _, entry in sorted(zip(num_search, ref))] + return b"".join(search), b"".join(ref) - w1_search: List[bytes] = [] - w1_finding: List[bytes] = [] - w2_search: List[bytes] = [] - w2_finding: List[bytes] = [] - w4_search: List[bytes] = [] - w4_finding: List[bytes] = [] + search: List[bytes] = [] + ref: List[bytes] = [] for ch in search_chars: - add_to_byte_arrays(w4_search, w4_finding, ch, 4) - if ord(ch) >= 65536: - continue - add_to_byte_arrays(w2_search, w2_finding, ch, 2) - if ord(ch) >= 128: - continue - add_to_byte_arrays(w1_search, w1_finding, ch, 1) - return ( - get_ordered_raw_bytes(w1_search, w1_finding) - + get_ordered_raw_bytes(w2_search, w2_finding) - + get_ordered_raw_bytes(w4_search, w4_finding) - ) + add_to_byte_arrays(search, ref, ch) + return get_ordered_raw_bytes(search, ref)