From f7d9942e7c4d5b191568246422b5e6ae072379e8 Mon Sep 17 00:00:00 2001 From: richardpaulhudson Date: Thu, 20 Oct 2022 21:48:53 +0200 Subject: [PATCH] Intermediate state --- spacy/errors.py | 1 - spacy/ml/models/tok2vec.py | 21 --- spacy/ml/richfeatureextractor.py | 85 +++++---- spacy/tests/doc/test_doc_api.py | 165 +++++++++-------- spacy/tests/test_util.py | 34 ++-- spacy/tokens/doc.pxd | 28 ++- spacy/tokens/doc.pyi | 16 +- spacy/tokens/doc.pyx | 294 +++++++++++++++++------------- spacy/util.py | 40 ++-- website/docs/api/architectures.md | 22 --- 10 files changed, 377 insertions(+), 329 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 5f93dac23..e472ff363 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -946,7 +946,6 @@ class Errors(metaclass=ErrorsWithCodes): "{value}.") E1044 = ("Search characters for '{label}' may not contain upper-case chars where case_sensitive==False.") E1045 = ("Invalid rich group config '{label}'.") - E1046 = ("Search characters may not contain characters that occupy four bytes in UTF-16.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index ba1851370..bd67613e1 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -260,27 +260,6 @@ def RichMultiHashEmbed( prevents the alternation from occurring, e.g. an `ä` in a German plural noun does not become `a` if it is the third or fourth vowel from the end of the word. - Internally, the model converts each token string to UTF-16 and assumes that each - character from the string occupies two bytes. This assumption holds for all - characters in the Basic Multilingual Plane, which encompasses all characters that - are ever likely to be of interest when extracting features. There are, however, - characters like emojis that are in the Extended Multilingual Plane and occupy - four bytes, although importantly neither of the two byte pairs that make up such - a representation can be a valid two-byte character in its own right. The - following considerations apply to the processing of four-byte characters: - - - An exceptional four-byte character within a text consisting mostly of two-byte - characters will probably be ignored by the neural network accepting the - embedding layer as not matching any of the learned features. - - If anyone did want to train a model for a language like Lycian that is - generally written in four-byte characters, prefix and suffix features can - still be extracted, but the length specifications should all be doubled, i.e. - `[2,4,6]` to extract one-, two- and three-character affixes. In such a - situation length specifications that are odd numbers would serve no useful - purpose since they would refer to half-characters. - - Four-byte characters are not accepted within search character specification - strings and lead to an error being thrown. - width (int): The output width. Also used as the width of the embedding tables. Recommended values are between 64 and 300. attrs (list of attr IDs): The token attributes to embed. 
A separate diff --git a/spacy/ml/richfeatureextractor.py b/spacy/ml/richfeatureextractor.py index 91c7b9580..a31242a9e 100644 --- a/spacy/ml/richfeatureextractor.py +++ b/spacy/ml/richfeatureextractor.py @@ -1,7 +1,7 @@ from typing import List, Optional, Callable, Tuple -from ..util import get_byte_arrays_for_search_chars -from thinc.types import Ints2d -from thinc.api import Model, registry +from ..util import get_arrays_for_search_chars +from thinc.types import Ints1d, Ints2d +from thinc.api import Model, registry, get_current_ops from ..tokens import Doc @@ -17,33 +17,46 @@ def RichFeatureExtractor( suff_search_chars: Optional[str] = None, suff_search_lengths: Optional[List[int]] = None, ) -> Model[List[Doc], List[Ints2d]]: + ops = get_current_ops() if pref_search_chars is not None: - pref_search, pref_ref = get_byte_arrays_for_search_chars(pref_search_chars, case_sensitive) + pref_search, pref_lookup = get_arrays_for_search_chars( + pref_search_chars, case_sensitive + ) else: - pref_search, pref_ref = bytes(), bytes() + pref_search, pref_lookup = bytes(), bytes() if suff_search_chars is not None: - suff_search, suff_ref = get_byte_arrays_for_search_chars(suff_search_chars, case_sensitive) + suff_search, suff_lookup = get_arrays_for_search_chars( + suff_search_chars, case_sensitive + ) else: - suff_search, suff_ref = bytes(), bytes() + suff_search, suff_lookup = bytes(), bytes() return Model( "extract_character_combination_hashes", forward, attrs={ "case_sensitive": case_sensitive, - "pref_lengths": pref_lengths if pref_lengths is not None else [], - "suff_lengths": suff_lengths if suff_lengths is not None else [], + "pref_lengths": ops.asarray1i(pref_lengths) + if pref_lengths is not None + else ops.asarray1i([]), + "suff_lengths": ops.asarray1i(suff_lengths) + if suff_lengths is not None + else ops.asarray1i([]), "pref_search": pref_search, - "pref_ref": pref_ref, - "pref_s_char_l": len(pref_search) / 4 if pref_search_chars is not None else 0, - "pref_search_lengths": pref_search_lengths + "pref_lookup": pref_lookup, + "pref_search_char_len": len(pref_search) / 4 + if pref_search_chars is not None + else 0, + "pref_search_lengths": ops.asarray1i(pref_search_lengths) if pref_search_lengths is not None - else [], + else ops.asarray1i([]), "suff_search": suff_search, - "suff_ref": suff_ref, - "suff_s_char_l": len(suff_search) / 4 if suff_search_chars is not None else 0, - "suff_search_lengths": suff_search_lengths + "suff_lookup": suff_lookup, + "suff_search_char_len": len(suff_search) / 4 + if suff_search_chars is not None + else 0, + "suff_search_lengths": ops.asarray1i(suff_search_lengths) if suff_search_lengths is not None - else [], + else ops.asarray1i([]), }, ) @@ -53,30 +66,30 @@ def forward( ) -> Tuple[List[Ints2d], Callable]: ops = model.ops case_sensitive: bool = model.attrs["case_sensitive"] - pref_lengths: List[int] = model.attrs["pref_lengths"] - suff_lengths: List[int] = model.attrs["suff_lengths"] + pref_lengths: Ints1d = model.attrs["pref_lengths"] + suff_lengths: Ints1d = model.attrs["suff_lengths"] pref_search: bytes = model.attrs["pref_search"] - pref_ref: bytes = model.attrs["pref_ref"] - pref_s_char_l: int = model.attr["pref_s_char_l"] - pref_search_lengths: List[int] = model.attrs["pref_search_lengths"] + pref_lookup: bytes = model.attrs["pref_lookup"] + pref_search_char_len: int = model.attrs["pref_search_char_len"] + pref_search_lengths: Ints1d = model.attrs["pref_search_lengths"] suff_search: bytes = model.attrs["suff_search"] - suff_ref: bytes = 
model.attrs["suff_ref"] - suff_s_char_l: int = model.attr["suff_s_char_l"] - suff_search_lengths: List[int] = model.attrs["suff_search_lengths"] + suff_lookup: bytes = model.attrs["suff_lookup"] + suff_search_char_len: int = model.attrs["suff_search_char_len"] + suff_search_lengths: Ints1d = model.attrs["suff_search_lengths"] features: List[Ints2d] = [] for doc in docs: hashes = doc.get_character_combination_hashes( - case_sensitive=case_sensitive, - pref_lengths=pref_lengths, - suff_lengths=suff_lengths, - pref_search=pref_search, - pref_ref=pref_ref, - pref_s_char_l=pref_s_char_l, - pref_search_lengths=pref_search_lengths, - suff_search=suff_search, - suff_ref=suff_ref, - suff_s_char_l=suff_s_char_l, - suff_search_lengths=suff_search_lengths, + cs=case_sensitive, + p_lengths=pref_lengths, + s_lengths=suff_lengths, + ps_search=pref_search, + ps_lookup=pref_lookup, + ps_l=pref_search_char_len, + ps_lengths=pref_search_lengths, + ss_search=suff_search, + ss_lookup=suff_lookup, + ss_l=suff_search_char_len, + ss_lengths=suff_search_lengths, ) features.append(ops.asarray2i(hashes)) diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index d87cbdf5b..0a1838a3c 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -14,7 +14,7 @@ from spacy.lang.xx import MultiLanguage from spacy.language import Language from spacy.lexeme import Lexeme from spacy.tokens import Doc, Span, SpanGroup, Token -from spacy.util import get_byte_arrays_for_search_chars +from spacy.util import get_arrays_for_search_chars from spacy.vocab import Vocab from .test_underscore import clean_underscore # noqa: F401 @@ -1004,21 +1004,22 @@ def _get_unsigned_32_bit_hash(input: str) -> int: @pytest.mark.parametrize("case_sensitive", [True, False]) def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive): - doc = en_tokenizer("spaCy✨ and Prodigy") - suff_search, suff_ref = get_byte_arrays_for_search_chars("xx✨rp", case_sensitive) + ops = get_current_ops() + pref_search, pref_lookup = get_arrays_for_search_chars("Rp", case_sensitive) + suff_search, suff_lookup = get_arrays_for_search_chars("xx✨rp", case_sensitive) hashes = doc.get_character_combination_hashes( cs=case_sensitive, - pref_lengths=[1, 4, 3], - suff_lengths=[2, 3, 4, 5], - pref_search=bytes(), - pref_ref=bytes(), - pref_s_char_l = 0, - pref_search_lengths=[2], - suff_search=suff_search, - suff_ref=suff_ref, - suff_s_char_l=5 if case_sensitive else 9, - suff_search_lengths=[2,1], + p_lengths=ops.asarray1i([1, 4, 3]), + s_lengths=ops.asarray1i([2, 3, 4, 5]), + ps_search=pref_search, + ps_lookup=pref_lookup, + ps_l=2 if case_sensitive else 4, + ps_lengths=ops.asarray1i([2]), + ss_search=suff_search, + ss_lookup=suff_lookup, + ss_l=5 if case_sensitive else 9, + ss_lengths=ops.asarray1i([2, 1]), ) assert hashes[0][0] == _get_unsigned_32_bit_hash("s") @@ -1035,7 +1036,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive "spaCy" if case_sensitive else "spacy" ) - assert hashes[0][7] == _get_unsigned_32_bit_hash(" ") + assert hashes[0][7] == _get_unsigned_32_bit_hash("p ") assert hashes[0][8] == _get_unsigned_32_bit_hash("p ") assert hashes[0][9] == _get_unsigned_32_bit_hash("p") assert hashes[1][0] == _get_unsigned_32_bit_hash("✨") @@ -1067,7 +1068,7 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[3][4] == _get_unsigned_32_bit_hash("igy") assert hashes[3][5] == _get_unsigned_32_bit_hash("digy") assert hashes[3][6] 
== _get_unsigned_32_bit_hash("odigy") - assert hashes[3][7] == _get_unsigned_32_bit_hash(" ") + assert hashes[3][7] == _get_unsigned_32_bit_hash(" " if case_sensitive else "pr") assert hashes[3][9] == _get_unsigned_32_bit_hash("r") @@ -1077,73 +1078,93 @@ def test_get_character_combination_hashes_good_case(en_tokenizer, case_sensitive assert hashes[3][8] == _get_unsigned_32_bit_hash("rp") # check values are the same cross-platform - assert hashes[0][1] == 753329845 if case_sensitive else 18446744071614199016 - assert hashes[1][3] == 3425774424 - assert hashes[2][8] == 3076404432 + if case_sensitive: + assert hashes[0][1] == 3712103410 + else: + assert hashes[0][1] == 307339932 + assert hashes[1][3] == 2414314354 + assert hashes[2][8] == 1669671676 -def test_get_character_combination_hashes_4_byte_char_at_end(en_tokenizer): - doc = en_tokenizer("and𐌞") - suff_search, suff_ref = get_byte_arrays_for_search_chars("a", True) +def test_get_character_combination_hashes_good_case_partial(en_tokenizer): + doc = en_tokenizer("spaCy✨ and Prodigy") + ops = get_current_ops() + pref_search, pref_lookup = get_arrays_for_search_chars("rp", False) hashes = doc.get_character_combination_hashes( - cs=True, - pref_lengths=[], - suff_lengths=[1, 2, 3], - pref_search=bytes(), - pref_ref=bytes(), - pref_s_char_l = 0, - pref_search_lengths=[], - suff_search=suff_search, - suff_ref=suff_ref, - suff_s_char_l=1, - suff_search_lengths=[1], + cs=False, + p_lengths=ops.asarray1i([]), + s_lengths=ops.asarray1i([2, 3, 4, 5]), + ps_search=pref_search, + ps_lookup=pref_lookup, + ps_l=4, + ps_lengths=ops.asarray1i([2]), + ss_search=bytes(), + ss_lookup=bytes(), + ss_l=0, + ss_lengths=ops.asarray1i([]), ) - assert hashes[0][0] == _get_unsigned_32_bit_hash("𐌞") - assert hashes[0][1] == _get_unsigned_32_bit_hash("d𐌞") - assert hashes[0][2] == _get_unsigned_32_bit_hash("nd𐌞") - assert hashes[0][3] == _get_unsigned_32_bit_hash("a") + + assert hashes[0][0] == _get_unsigned_32_bit_hash("cy") + assert hashes[0][1] == _get_unsigned_32_bit_hash("acy") + assert hashes[0][2] == _get_unsigned_32_bit_hash("pacy") + assert hashes[0][3] == _get_unsigned_32_bit_hash("spacy") + assert hashes[0][4] == _get_unsigned_32_bit_hash("p ") + assert hashes[1][0] == _get_unsigned_32_bit_hash(" ✨") + assert hashes[1][1] == _get_unsigned_32_bit_hash(" ✨") + assert hashes[1][2] == _get_unsigned_32_bit_hash(" ✨") + assert hashes[1][3] == _get_unsigned_32_bit_hash(" ✨") + assert hashes[1][4] == _get_unsigned_32_bit_hash(" ") + assert hashes[2][0] == _get_unsigned_32_bit_hash("nd") + assert hashes[2][1] == _get_unsigned_32_bit_hash("and") + assert hashes[2][2] == _get_unsigned_32_bit_hash(" and") + assert hashes[2][3] == _get_unsigned_32_bit_hash(" and") + assert hashes[2][4] == _get_unsigned_32_bit_hash(" ") + assert hashes[3][0] == _get_unsigned_32_bit_hash("gy") + assert hashes[3][1] == _get_unsigned_32_bit_hash("igy") + assert hashes[3][2] == _get_unsigned_32_bit_hash("digy") + assert hashes[3][3] == _get_unsigned_32_bit_hash("odigy") + assert hashes[3][4] == _get_unsigned_32_bit_hash("pr") -def test_get_character_combination_hashes_4_byte_char_in_middle(en_tokenizer): - doc = en_tokenizer("and𐌞a") - hashes = doc.get_character_combination_hashes( - case_sensitive=False, - pref_lengths=[], - suff_lengths=[1, 2, 3, 4], - pref_search_chars="", - pref_search_lengths=[], - suff_search_chars="a", - suff_search_lengths=[1, 2], - ) - assert hashes[0][0] == _get_unsigned_32_bit_hash("a") - assert hashes[0][2] == _get_unsigned_32_bit_hash("𐌞a") - assert 
hashes[0][3] == _get_unsigned_32_bit_hash("d𐌞a") - assert hashes[0][4] == _get_unsigned_32_bit_hash("a") - assert hashes[0][5] == _get_unsigned_32_bit_hash("aa") -def test_get_character_combination_hashes_4_byte_special_char(en_tokenizer): - doc = en_tokenizer("and𐌞") - with pytest.raises(ValueError): - doc.get_character_combination_hashes( - case_sensitive=True, - pref_lengths=[], - suff_lengths=[2, 3, 4, 5], - pref_search_chars="", - pref_search_lengths=[], - suff_search_chars="𐌞", - suff_search_lengths=[2], - ) +def test_get_character_combination_hashes_copying_in_middle(en_tokenizer): + doc = en_tokenizer("sp𐌞Cé") + ops = get_current_ops() + + for p_length in range(1, 8): + for s_length in range(1, 8): + hashes = doc.get_character_combination_hashes( + cs=False, + p_lengths=ops.asarray1i([p_length]), + s_lengths=ops.asarray1i([s_length]), + ps_search=bytes(), + ps_lookup=bytes(), + ps_l=0, + ps_lengths=ops.asarray1i([]), + ss_search=bytes(), + ss_lookup=bytes(), + ss_l=0, + ss_lengths=ops.asarray1i([]), + ) + + assert hashes[0][0] == _get_unsigned_32_bit_hash("sp𐌞cé "[:p_length]) + assert hashes[0][1] == _get_unsigned_32_bit_hash(" sp𐌞cé"[8 - s_length :]) def test_character_combination_hashes_empty_lengths(en_tokenizer): doc = en_tokenizer("and𐌞") - assert doc.get_character_combination_hashes( - case_sensitive=True, - pref_lengths=[], - suff_lengths=[], - pref_search_chars="", - pref_search_lengths=[], - suff_search_chars="", - suff_search_lengths=[], + ops = get_current_ops() + hashes = doc.get_character_combination_hashes( + cs=True, + p_lengths=ops.asarray1i([]), + s_lengths=ops.asarray1i([]), + ps_search=bytes(), + ps_lookup=bytes(), + ps_l=0, + ps_lengths=ops.asarray1i([]), + ss_search=bytes(), + ss_lookup=bytes(), + ss_l=0, + ss_lengths=ops.asarray1i([]), ).shape == (1, 0) diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py index 33acbc3f2..b4b5744ad 100644 --- a/spacy/tests/test_util.py +++ b/spacy/tests/test_util.py @@ -1,13 +1,13 @@ import spacy -def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive(): +def test_get_arrays_for_search_chars_width_2_not_case_sensitive(): ( search, - ref, - ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", False) + lookup, + ) = spacy.util.get_arrays_for_search_chars("bféwfw", False) assert ( - ref + lookup == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" ) @@ -17,39 +17,39 @@ def test_get_byte_arrays_for_search_chars_width_2_not_case_sensitive(): ) -def test_get_byte_arrays_for_search_chars_width_2_case_sensitive(): +def test_get_arrays_for_search_chars_width_2_case_sensitive(): ( search, - ref, - ) = spacy.util.get_byte_arrays_for_search_chars("bféwfw", True) + lookup, + ) = spacy.util.get_arrays_for_search_chars("bféwfw", True) assert ( - ref == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00" + lookup == search == b"b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00" ) -def test_get_byte_arrays_for_search_chars_width_4_not_case_sensitive(): +def test_get_arrays_for_search_chars_width_4_not_case_sensitive(): ( search, - ref, - ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False) + lookup, + ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", False) assert ( search == b"\x1e\x03\x01\x00B\x00\x00\x00F\x00\x00\x00W\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" ) assert ( - ref + lookup == 
b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xe9\x00\x00\x00\xe9\x00\x00\x00" ) -def test_get_byte_arrays_for_search_chars_width_4_case_sensitive(): +def test_get_arrays_for_search_chars_width_4_case_sensitive(): ( search, - ref, - ) = spacy.util.get_byte_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True) - assert search == ref + lookup, + ) = spacy.util.get_arrays_for_search_chars("bfé𐌞wf𐌞wÉ", True) + assert search == lookup assert ( - ref + lookup == b"\x1e\x03\x01\x00b\x00\x00\x00f\x00\x00\x00w\x00\x00\x00\xc9\x00\x00\x00\xe9\x00\x00\x00" ) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 1d97d14c4..39a199ff3 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -18,6 +18,11 @@ ctypedef fused LexemeOrToken: const_TokenC_ptr +cdef extern from "unicodeobject.h": + bint Py_UNICODE_ISUPPER(Py_UCS4 ch) + Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch) + + cdef int set_children_from_heads(TokenC* tokens, int start, int end) except -1 @@ -33,25 +38,34 @@ cdef int token_by_end(const TokenC* tokens, int length, int end_char) except -2 cdef int [:,:] _get_lca_matrix(Doc, int start, int end) -cdef void _populate_aff_buf( +cdef void _copy_chars( + Py_UCS4* target, + const Py_UCS4* source, + const int length, + const bint to_lower +) + + +cdef void _set_affixes( const Py_UCS4* text_buf, const int tok_idx, const int tok_len, Py_UCS4* aff_buf, - const int pref_length, - const int suff_length, + const int pref_len, + const int suff_len, const bint to_lower ) -cdef void _populate_search_buf( + +cdef void _search_for_chars( const Py_UCS4* text_buf, const int tok_idx, const int tok_len, Py_UCS4* search_buf, - Py_UCS4* ref_buf, + Py_UCS4* lookup_buf, const int search_buf_len, - Py_UCS4* finding_buf, - const int finding_buf_len, + Py_UCS4* result_buf, + const int result_buf_len, bint suffs_not_prefs ) diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi index 975367208..7e4962953 100644 --- a/spacy/tokens/doc.pyi +++ b/spacy/tokens/doc.pyi @@ -1,7 +1,7 @@ from typing import Callable, Protocol, Iterable, Iterator, Optional from typing import Union, Tuple, List, Dict, Any, overload from cymem.cymem import Pool -from thinc.types import Floats1d, Floats2d, Ints2d +from thinc.types import Floats1d, Floats2d, Ints1d, Ints2d from .span import Span from .token import Token from ._dict_proxies import SpanGroups @@ -177,17 +177,17 @@ class Doc: def get_character_combination_hashes( self, *, - case_sensitive: bool, - pref_lengths: List[int], - suff_lengths: List[int], + cs: bool, + pref_lengths: Ints1d, + suff_lengths: Ints1d, pref_search_chars: str, - pref_ref_chars: str, + pref_lookup_chars: str, pref_search_char_length: int, - pref_search_lengths: List[int], + pref_search_lengths: Ints1d, suff_search_chars: str, - suff_ref_chars: str, + suff_lookup_chars: str, suff_search_char_length: int, - suff_search_lengths: List[int], + suff_search_lengths: Ints1d, ) -> Ints2d: ... @staticmethod def _get_array_attrs() -> Tuple[Any]: ... 
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c0d3890fd..33f141ced 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -3,6 +3,7 @@ from typing import Set, List cimport cython cimport numpy as np +from cpython cimport array from libc.string cimport memcpy, memcmp, memset from libc.math cimport sqrt from libc.stdint cimport int32_t, uint64_t @@ -105,16 +106,6 @@ class SetEntsDefault(str, Enum): return list(cls.__members__.keys()) -cdef extern from "unicodeobject.h": - Py_UCS4 PyUnicode_READ(int kind, void *data, int index) - void* PyUnicode_DATA(void* o) - void PyUnicode_READY(void * o) - int PyUnicode_KIND(void *data) - int PyUnicode_IS_COMPACT(void *data) - - Py_UCS4 Py_UNICODE_TOLOWER(Py_UCS4 ch) - - cdef class Doc: """A sequence of Token objects. Access sentences and named entities, export annotations to numpy arrays, losslessly serialize to compressed binary @@ -1745,103 +1736,129 @@ cdef class Doc: return output - def get_character_combination_hashes( - self, + def get_character_combination_hashes(self, *, - bint cs, - pref_lengths: List[int], - suff_lengths: List[int], - char* pref_search, - char* pref_ref, - int pref_s_char_l, - pref_search_lengths: List[int], - char* suff_search, - char* suff_ref, - int suff_s_char_l, - suff_search_lengths: List[int], + const bint cs, + np.ndarray p_lengths, + np.ndarray s_lengths, + const char* ps_search, + const char* ps_lookup, + const int ps_l, + np.ndarray ps_lengths, + const char* ss_search, + const char* ss_lookup, + const int ss_l, + np.ndarray ss_lengths, ): """ Returns a 2D NumPy array where the rows represent tokens and the columns represent hashes of various character combinations - derived from the string (text/orth) of each token. + derived from the raw text of each token. + + Generally: + p_ variables relate to prefixes (affixes starting at the beginning of the word) + s_ variables relate to suffixes (affixes starting at the end of the word) + ps_ variables relate to searches starting at the beginning of the word + ss_ variables relate to searches starting at the end of the word - cs: if *False*, the lower-case version of each token string is used as the basis for generating hashes. Note that - if *cs==False*, upper-case characters in *search_chars* will not be found in token strings. - pref_lengths: an integer list specifying the lengths of prefixes to be hashed. For example, if *pref_lengths==[2, 3]*, + cs: if *False*, hashes are generated based on the lower-case version of each token. + p_lengths: an Ints1d specifying the lengths of prefixes to be hashed. For example, if *p_lengths==[2, 3]*, the prefixes hashed for "spaCy" would be "sp" and "spa". - suff_lengths: an integer list specifying the lengths of suffixes to be hashed. For example, if *suff_lengths==[2, 3]* and - *case_sensitive == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". - pref_search_chars: a string containing characters to search for within each token, starting at the beginning. - pref_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if - *pref_search_lengths==[1, 2]*, *pref_search_chars=="aC" and *cs==False*, the searched strings hashed for + s_lengths: an Ints1d specifying the lengths of suffixes to be hashed. For example, if *s_lengths==[2, 3]* and + *cs == True*, the suffixes hashed for "spaCy" would be "Cy" and "aCy". + ps_search: a byte array containing characters to search for within each token, starting at the beginning. 
+ ps_lookup: a byte array containing characters that are added to the result string when a character at + the corresponding position in *ps_search* is matched. Having separate search and lookup arrays enables + case-insensitivity to be handled efficiently. + ps_l: the number of characters in *ps_search* and hence also in *ps_lookup*. + ps_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if + *ps_lengths==[1, 2]*, *ps_search=="aC"* and *cs==False*, the searched strings hashed for "spaCy" would be "a" and "ac". - suff_search_chars: a string containing characters to search for within each token, starting at the end. - suff_search_lengths: an integer list specifying the lengths of search results to be hashed. For example if + ss_search: a byte array containing characters to search for within each token, starting at the end. + ss_lookup: a byte array containing characters that are added to the result string when a character at + the corresponding position in *ss_search* is matched. Having separate search and lookup arrays enables + case-insensitivity to be handled efficiently. + ss_l: the number of characters in *ss_search* and hence also in *ss_lookup*. + ss_lengths: an Ints1d specifying the lengths of search results to be hashed. For example if *ss_lengths==[1, 2]*, *ss_search=="aC"* and *cs==False*, the searched strings hashed for "spaCy" would be "c" and "ca". For a document with tokens ["spaCy", "and", "Prodigy"], the NumPy array returned by - *get_character_combination_hashes(True, [2], [2, 4, 6], "yC", [1], [2])* would correspond to + *get_character_combination_hashes(True, [2], [2, 4, 6], "", "", 0, [], "yC", "yC", 2, [1, 2])* would correspond to - [[hash("sp"), [hash("Cy"), hash("paCy"), hash("spaCy"), hash("y"), hash("yC")], - [hash("an"), hash("nd"), hash("and", hash("and"), hash(" "), hash(" "))], + [[hash("sp"), hash("Cy"), hash("paCy"), hash(" spaCy"), hash("y"), hash("yC")], + [hash("an"), hash("nd"), hash(" and"), hash(" and"), hash(" "), hash(" ")], [hash("Pr") ,hash("gy"), hash("digy"), hash("rodigy"), hash("y"), hash("y ")]] """ - cdef int max_pref_l = max(pref_lengths) if len(pref_lengths) > 0 else 0 - cdef int max_suff_l = max(suff_lengths) if len(suff_lengths) > 0 else 0 - cdef int aff_buf_l = max_pref_l + max_suff_l - cdef int max_s_pref_l = max(pref_search_lengths) if len(pref_search_lengths) > 0 else 0 - cdef int max_s_suff_l = max(suff_search_lengths) if len(suff_search_lengths) > 0 else 0 - - cdef Py_UCS4* aff_buf = self.mem.alloc(4, aff_buf_l) - cdef Py_UCS4* pref_s_buf = pref_search - cdef Py_UCS4* pref_r_buf = pref_ref - cdef Py_UCS4* pref_f_buf = self.mem.alloc(4, max_s_pref_l) - cdef Py_UCS4* suff_s_buf = suff_search - cdef Py_UCS4* suff_r_buf = suff_ref - cdef Py_UCS4* suff_f_buf = self.mem.alloc(4, max_s_suff_l) - + # Encode the document text cdef bytes encoded_text = self.text.encode("utf-32le") cdef char* intermediate_text = encoded_text cdef Py_UCS4* text_buf = intermediate_text - cdef unsigned int num_toks = len(self), aff_len - cdef unsigned int h_pref_n = len(pref_lengths) - cdef unsigned int h_suff_n = len(suff_lengths), h_suff_end_idx = len(pref_lengths) + len(suff_lengths) - cdef unsigned int h_pref_s_n = len(pref_search_lengths), h_pref_s_end_idx = h_suff_end_idx + h_pref_s_n - cdef unsigned int h_suff_s_n = len(suff_search_lengths), h_suff_s_end_idx = h_pref_s_end_idx + h_suff_s_n - cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, h_suff_s_end_idx), dtype="int64") + # 
Define the result array and work out what is used for what in axis 1 + cdef int num_toks = len(self) + cdef int p_h_num = p_lengths.shape[0] + cdef int s_h_num = s_lengths.shape[0], s_h_end = p_h_num + s_h_num + cdef int ps_h_num = ps_lengths.shape[0], ps_h_end = s_h_end + ps_h_num + cdef int ss_h_num = ss_lengths.shape[0], ss_h_end = ps_h_end + ss_h_num + cdef np.ndarray[np.int64_t, ndim=2] hashes = numpy.empty((num_toks, ss_h_end), dtype="int64") + + # Determine the maximum possible lengths of the affixes to work out how big the buffers need to be + cdef int p_max_l = max(p_lengths) if p_h_num > 0 else 0 + cdef int s_max_l = max(s_lengths) if s_h_num > 0 else 0 + cdef int ps_max_l = max(ps_lengths) if ps_h_num > 0 else 0 + cdef int ss_max_l = max(ss_lengths) if ss_h_num > 0 else 0 + + # Define / allocate buffer (pr/sr: result buffers) + cdef int aff_buf_l = p_max_l + s_max_l + cdef Py_UCS4* aff_buf = self.mem.alloc(aff_buf_l, sizeof(Py_UCS4)) + cdef Py_UCS4* ps_buf = ps_search + cdef Py_UCS4* pl_buf = ps_lookup + cdef Py_UCS4* pr_buf = self.mem.alloc(ps_max_l, sizeof(Py_UCS4)) + cdef Py_UCS4* ss_buf = ss_search + cdef Py_UCS4* sl_buf = ss_lookup + cdef Py_UCS4* sr_buf = self.mem.alloc(ss_max_l, sizeof(Py_UCS4)) + # Define memory views on length arrays + cdef int[:] p_v = p_lengths + cdef int[:] s_v = s_lengths + cdef int[:] ps_v = ps_lengths + cdef int[:] ss_v = ss_lengths + + # Define working variables cdef TokenC tok_c + cdef int tok_i, tok_idx, tok_len, aff_len for tok_i in range(num_toks): tok_c = self.c[tok_i] tok_idx = tok_c.idx tok_len = tok_c.lex.length - _populate_aff_buf(text_buf, tok_idx, tok_len, aff_buf, max_pref_l, max_suff_l, not cs) - _populate_search_buf(text_buf, tok_idx, tok_len, pref_s_buf, pref_r_buf, pref_s_char_l, pref_f_buf, max_s_pref_l, False) - _populate_search_buf(text_buf, tok_idx, tok_len, suff_s_buf, suff_r_buf, suff_s_char_l, suff_f_buf, max_s_suff_l, True) - - for hash_idx in range(h_pref_n): - aff_len = pref_lengths[hash_idx] - hashes[tok_i, hash_idx] = hash32(aff_buf, aff_len * 4, 0) - - for hash_idx in range(h_pref_n, h_suff_end_idx): - aff_len = suff_lengths[hash_idx - h_pref_n] - hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * 4, 0) + if aff_buf_l > 0: + _set_affixes(text_buf, tok_idx, tok_len, aff_buf, p_max_l, s_max_l, not cs) - for hash_idx in range(h_suff_end_idx, h_pref_s_end_idx): - aff_len = pref_search_lengths[hash_idx - h_suff_end_idx] - hashes[tok_i, hash_idx] = hash32(pref_f_buf, aff_len * 4, 0) + for hash_idx in range(p_h_num): + hashes[tok_i, hash_idx] = hash32(aff_buf, p_v[hash_idx] * sizeof(Py_UCS4), 0) + + for hash_idx in range(p_h_num, s_h_end): + aff_len = s_v[hash_idx - p_h_num] + hashes[tok_i, hash_idx] = hash32(aff_buf + aff_buf_l - aff_len, aff_len * sizeof(Py_UCS4), 0) + + if ps_h_num > 0: + _search_for_chars(text_buf, tok_idx, tok_len, ps_buf, pl_buf, ps_l, pr_buf, ps_max_l, False) + for hash_idx in range(s_h_end, ps_h_end): + aff_len = ps_v[hash_idx - s_h_end] + hashes[tok_i, hash_idx] = hash32(pr_buf, aff_len * sizeof(Py_UCS4), 0) - for hash_idx in range(h_pref_s_end_idx, h_suff_s_end_idx): - aff_len = suff_search_lengths[hash_idx - h_pref_s_end_idx] - hashes[tok_i, hash_idx] = hash32(suff_f_buf, aff_len * 4, 0) + if ss_h_num > 0: + _search_for_chars(text_buf, tok_idx, tok_len, ss_buf, sl_buf, ss_l, sr_buf, ss_max_l, True) + for hash_idx in range(ps_h_end, ss_h_end): + aff_len = ss_v[hash_idx - ps_h_end] + hashes[tok_i, hash_idx] = hash32(sr_buf, aff_len * sizeof(Py_UCS4), 0) self.mem.free(aff_buf) 
- self.mem.free(pref_f_buf) - self.mem.free(suff_f_buf) + self.mem.free(pr_buf) + self.mem.free(sr_buf) return hashes @staticmethod @@ -2025,76 +2042,103 @@ cdef int [:,:] _get_lca_matrix(Doc doc, int start, int end): return lca_matrix -cdef void _populate_aff_buf( +cdef void _copy_chars( + Py_UCS4* target, + const Py_UCS4* source, + const int length, + const bint to_lower +): + """Copies *length* Py_UCS4 characters from *source* to *target*. If *to_lower==True*, converts + any upper-case characters to lower case within the target buffer. + """ + memcpy(target, source, length * sizeof(Py_UCS4)) + cdef int idx + if to_lower: + for idx in range(length): + if Py_UNICODE_ISUPPER(target[idx]): + target[idx] = Py_UNICODE_TOLOWER(target[idx]) + + +cdef void _set_affixes( const Py_UCS4* text_buf, const int tok_idx, const int tok_len, Py_UCS4* aff_buf, - const int pref_length, - const int suff_length, + const int pref_len, + const int suff_len, const bint to_lower ): - """ Populate a buffer of length p+s with the first p and the last s characters of a word within a string. - If the word is shorter than p and/or s, the empty character positions in the middle are filled with zeros. + """ Populate a buffer of length pref+suff with the first pref and the last suff characters of a word within a string. + If the word is shorter than pref and/or suff, the empty character positions in the middle are filled with zeros. - str_data_ptr: a pointer to the raw data in the containing string, which must be in canonical - Unicode form (see PEP 393). - kind: the number of bytes occupied by each character in the containing string. - word_idx: the index of the first character of the word within the containing string. - word_len: the length of the word. + text_buf: a pointer to a UTF-32LE representation of the containing string. + tok_idx: the index of the first character of the word within the containing string. + tok_len: the length of the word. aff_buf: the buffer to populate. - pref_length: the length of the prefix. - suff_length: the length of the suffix. + pref_len: the length of the prefix. + suff_len: the length of the suffix. to_lower: if *True*, any upper case characters in either affix are converted to lower case. 
""" - cdef int aff_buf_idx = 0, buf_size = pref_length + suff_length, in_word_idx + cdef int aff_buf_idx = 0, aff_buf_len = pref_len + suff_len, in_word_idx, filled_pref_len - while aff_buf_idx < pref_length and aff_buf_idx < tok_len: + if pref_len > 0: + filled_pref_len = pref_len if pref_len < tok_len else tok_len + _copy_chars(aff_buf, text_buf + tok_idx, filled_pref_len, to_lower) + aff_buf_idx = filled_pref_len - memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + aff_buf_idx, 4) - if to_lower: - aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx]) - aff_buf_idx += 1 + if tok_len < pref_len: + memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (pref_len - tok_len)) + aff_buf_idx = aff_buf_len - suff_len + if tok_len < suff_len: + memset(aff_buf + aff_buf_idx, 0, sizeof(Py_UCS4) * (suff_len - tok_len)) + aff_buf_idx = aff_buf_len - tok_len - if aff_buf_idx < buf_size - tok_len: - # fill out the empty middle part of the buffer with zeros - memset(aff_buf, 0, buf_size - suff_length - aff_buf_idx) + if suff_len > 0: + in_word_idx = aff_buf_idx + tok_len - aff_buf_len + if in_word_idx < pref_len: + memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, sizeof(Py_UCS4) * (filled_pref_len - in_word_idx)) + aff_buf_idx += filled_pref_len - in_word_idx + if aff_buf_idx < aff_buf_len: + _copy_chars(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, aff_buf_len - aff_buf_idx, to_lower) - while aff_buf_idx < buf_size: - in_word_idx = aff_buf_idx + tok_len - buf_size - # for suffixes we have to track the in-word index separately from the in-buffer index - if in_word_idx < pref_length: - # we've already retrieved this character as part of the prefix, so copy it from there - # as that's quicker than retrieving it from the input string a second time - memcpy(aff_buf + aff_buf_idx, aff_buf + in_word_idx, 4) - else: - memcpy(aff_buf + aff_buf_idx, text_buf + tok_idx + in_word_idx, 4) - if to_lower: - aff_buf[aff_buf_idx] = Py_UNICODE_TOLOWER(aff_buf[aff_buf_idx]) - aff_buf_idx += 1 -cdef void _populate_search_buf( +cdef void _search_for_chars( const Py_UCS4* text_buf, const int tok_idx, const int tok_len, Py_UCS4* search_buf, - Py_UCS4* ref_buf, + Py_UCS4* lookup_buf, const int search_buf_len, - Py_UCS4* finding_buf, - const int finding_buf_len, + Py_UCS4* result_buf, + const int result_buf_len, bint suffs_not_prefs ): - cdef unsigned int finding_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx - cdef unsigned int search_buf_idx - cdef int cmp_res + """ Search a word within a string for characters within *search_buf*, starting at the beginning or + end depending on the value of *suffs_not_prefs*. Wherever a character from *search_buf* matches, + the corresponding character from *lookup_buf* is added to *result_buf*. - while finding_buf_idx < finding_buf_len: + text_buf: a pointer to a UTF-32LE representation of the containing string. + tok_idx: the index of the first character of the word within the containing string. + tok_len: the length of the word. + search_buf: the characters to search for (ordered). + lookup_buf: characters corresponding to *search_buf* to add to *result_buf* in the case of a match. + Having separate search and lookup arrays enables case-insensitivity to be handled efficiently. + search_buf_len: the length of *search_buf* and hence also of *lookup_buf*. + result_buf: the buffer in which to place the results. + result_buf_len: the length of *result_buf*. 
+ suffs_not_prefs: if *True*, searching starts from the end of the word; if *False*, from the beginning. + """ + cdef int result_buf_idx = 0, text_string_idx = tok_idx + (tok_len - 1) if suffs_not_prefs else tok_idx + cdef int search_buf_idx + cdef int cmp_result + + while result_buf_idx < result_buf_len: for search_buf_idx in range (search_buf_len): - cmp_res = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, 4) - if cmp_res == 0: - memcpy(finding_buf + finding_buf_idx, ref_buf + search_buf_idx, 4) - finding_buf_idx += 1 - if cmp_res >= 0: + cmp_result = memcmp(search_buf + search_buf_idx, text_buf + text_string_idx, sizeof(Py_UCS4)) + if cmp_result == 0: + memcpy(result_buf + result_buf_idx, lookup_buf + search_buf_idx, sizeof(Py_UCS4)) + result_buf_idx += 1 + if cmp_result >= 0: break if suffs_not_prefs: if text_string_idx <= tok_idx: @@ -2105,11 +2149,11 @@ cdef void _populate_search_buf( if text_string_idx >= tok_idx + tok_len: break - if finding_buf_idx < finding_buf_len: - memset(finding_buf + finding_buf_idx, 0, finding_buf_len - finding_buf_idx) + # fill in any unused characters in the result buffer with zeros + if result_buf_idx < result_buf_len: + memset(result_buf + result_buf_idx, 0, (result_buf_len - result_buf_idx) * sizeof(Py_UCS4)) - def pickle_doc(doc): bytes_data = doc.to_bytes(exclude=["vocab", "user_data", "user_hooks"]) hooks_and_data = (doc.user_data, doc.user_hooks, doc.user_span_hooks, diff --git a/spacy/util.py b/spacy/util.py index 89e51118e..8eaaf0889 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1737,7 +1737,7 @@ def all_equal(iterable): return next(g, True) and not next(g, False) -def get_byte_arrays_for_search_chars( +def get_arrays_for_search_chars( search_chars: str, case_sensitive: bool ) -> Tuple[bytes, bytes]: """ @@ -1746,14 +1746,14 @@ def get_byte_arrays_for_search_chars( for search characters. The encoding is little-endian regardless of architecture, as this is what is expected by the murmurhash library used downstream. - Alongside the "search byte array" against which words from document texts are compared - is the "ref byte array". When a character from the search byte array is matched, - the character at the corresponding position in the ref byte array is added to the - byte sequence of the configured length that is then hashed. This enables case-sensitivity + Alongside the "search array" against which words from document texts are compared + is the "lookup array". When a character from the search array is matched, + the character at the corresponding position in the lookup array is added to the + sequence that then goes on to be hashed. This enables case-sensitivity to be handled without converting the case of the words being searched: if *case_sensitive==False*, the lower- or uppercase counterparts of any characters that - have case are added to the search byte arrays, and both the original character and its - other-cased counterpart map to the lower-case version in the ref byte array. + have case are added to the search array, and both the original character and its + other-cased counterpart map to the lower-case version in the lookup array. """ def encode(ch: str) -> bytes: @@ -1762,8 +1762,8 @@ def get_byte_arrays_for_search_chars( """ return ch.encode("UTF-32LE") - def add_to_byte_arrays( - search: List[bytes], ref: List[bytes], ch: str + def add_to_arrays( + search: List[bytes], lookup: List[bytes], ch: str ) -> None: """Add the byte representations of *ch* to the two byte array lists. 
""" @@ -1771,36 +1771,36 @@ def get_byte_arrays_for_search_chars( if not case_sensitive and ch.islower(): if this_char_bytes not in search: search.append(this_char_bytes) - ref.append(this_char_bytes) + lookup.append(this_char_bytes) upper_char_bytes = encode(ch.upper()) if upper_char_bytes not in search: search.append(upper_char_bytes) - ref.append(this_char_bytes) + lookup.append(this_char_bytes) elif not case_sensitive and ch.isupper(): lower_char_bytes = encode(ch.lower()) if this_char_bytes not in search: search.append(this_char_bytes) - ref.append(lower_char_bytes) + lookup.append(lower_char_bytes) if lower_char_bytes not in search: search.append(lower_char_bytes) - ref.append(lower_char_bytes) + lookup.append(lower_char_bytes) elif this_char_bytes not in search: search.append(this_char_bytes) - ref.append(this_char_bytes) + lookup.append(this_char_bytes) def get_ordered_raw_bytes( - search: List[bytes], ref: List[bytes] + search: List[bytes], lookup: List[bytes] ) -> Tuple[bytes, bytes]: """Flatten the two lists, ordering both by the entries in *search* using the native endianness of the platform. """ num_search = [list(entry) for entry in search] search = [entry for _, entry in sorted(zip(num_search, search))] - ref = [entry for _, entry in sorted(zip(num_search, ref))] - return b"".join(search), b"".join(ref) + lookup = [entry for _, entry in sorted(zip(num_search, lookup))] + return b"".join(search), b"".join(lookup) search: List[bytes] = [] - ref: List[bytes] = [] + lookup: List[bytes] = [] for ch in search_chars: - add_to_byte_arrays(search, ref, ch) - return get_ordered_raw_bytes(search, ref) + add_to_arrays(search, lookup, ch) + return get_ordered_raw_bytes(search, lookup) diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 099c8d5d6..62ce83609 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -218,28 +218,6 @@ whose presence before or after characters that would otherwise alternate prevents the alternation from occurring, e.g. an `ä` in a German plural noun does not become `a` if it is the third or fourth vowel from the end of the word. -Internally, the model converts each token string to -[UTF-16](https://www.ietf.org/rfc/rfc2781.txt) and assumes that each character -from the string occupies two bytes. This assumption holds for all characters in -the Basic Multilingual Plane, which encompasses all characters that are ever -likely to be of interest when extracting features. There are, however, -characters like emojis that are in the Extended Multilingual Plane and occupy -four bytes, although importantly neither of the two byte pairs that make up such -a representation can be a valid two-byte character in its own right. The -following considerations apply to the processing of four-byte characters: - -- An exceptional four-byte character within a text consisting mostly of two-byte - characters will probably be ignored by the neural network accepting the - embedding layer as not matching any of the learned features. -- If anyone did want to train a model for a language like Lycian that is - generally written in four-byte characters, prefix and suffix features can - still be extracted, but the length specifications should all be doubled, i.e. - `[2,4,6]` to extract one-, two- and three-character affixes. In such a - situation length specifications that are odd numbers would serve no useful - purpose since they would refer to half-characters. 
-- Four-byte characters are not accepted within search character specification - strings and lead to an error being thrown. - | Name | Description | | ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. If static vectors are included, a learned linear layer is used to map the vectors to the specified width before concatenating it with the other embedding outputs. A single maxout layer is then used to reduce the concatenated vectors to the final width. ~~int~~ |
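Illustration (not part of the patch): the snippet below decodes the paired arrays produced by the renamed helper spacy.util.get_arrays_for_search_chars(). It assumes this branch of spaCy is installed and relies only on the UTF-32LE layout described in the helper's docstring; the expected pairs follow the updated assertions in spacy/tests/test_util.py.

import spacy

# Case-insensitive search characters: each cased character appears in both cases
# in the search array, and both entries map to the lower-case character in the
# lookup array, so matching can ignore case without lower-casing the token text.
search, lookup = spacy.util.get_arrays_for_search_chars("bféwfw", False)
for i in range(0, len(search), 4):
    s_char = search[i:i + 4].decode("utf-32le")
    l_char = lookup[i:i + 4].decode("utf-32le")
    print(f"search {s_char!r} -> lookup {l_char!r}")
# Expected pairs:
# 'B'->'b', 'F'->'f', 'W'->'w', 'b'->'b', 'f'->'f', 'w'->'w', 'É'->'é', 'é'->'é'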