mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Update docs to reflect StringStore changes
This commit is contained in:
parent
89bf635cbe
commit
414193e9ba
|
@ -74,9 +74,9 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
|
|||
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
||||
return string
|
||||
|
||||
|
||||
|
||||
cdef class StringStore:
|
||||
"""Lookup strings by 64-bit hash"""
|
||||
"""Look up strings by 64-bit hashes."""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
"""Create the StringStore.
|
||||
|
||||
|
@ -92,9 +92,9 @@ cdef class StringStore:
|
|||
self.add(string)
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""Retrieve a string from a given hash ID, or vice versa.
|
||||
"""Retrieve a string from a given hash, or vice versa.
|
||||
|
||||
string_or_id (bytes or unicode or uint64): The value to encode.
|
||||
string_or_id (bytes, unicode or uint64): The value to encode.
|
||||
Returns (unicode or uint64): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
|
@ -123,6 +123,11 @@ cdef class StringStore:
|
|||
return decode_Utf8Str(utf8str)
|
||||
|
||||
def add(self, string):
|
||||
"""Add a string to the StringStore.
|
||||
|
||||
string (unicode): The string to add.
|
||||
RETURNS (uint64): The string's hash value.
|
||||
"""
|
||||
if isinstance(string, unicode):
|
||||
if string in SYMBOLS_BY_STR:
|
||||
return SYMBOLS_BY_STR[string]
|
||||
|
|
|
@ -7,30 +7,30 @@
|
|||
</style>
|
||||
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
|
||||
<text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text>
|
||||
<text class="svg__vocab__text" dx="-0.5em" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">31979...</text>
|
||||
<rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
|
||||
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
|
||||
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text>
|
||||
<text class="svg__vocab__text" dx="-0.5em" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">46904...</text>
|
||||
<rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
|
||||
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
|
||||
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text>
|
||||
<text class="svg__vocab__text" dx="-0.7em" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">37020...</text>
|
||||
<rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
|
||||
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
|
||||
<rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
|
||||
<text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">"coffee"</text>
|
||||
<rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
|
||||
<text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3672</text>
|
||||
<text class="svg__vocab__text-box" dx="-0.5em" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">31979…</text>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
|
||||
<text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">"I"</text>
|
||||
<rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
|
||||
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text>
|
||||
<text class="svg__vocab__text-box" dx="-0.7em" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">46904…</text>
|
||||
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
|
||||
<text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">"love"</text>
|
||||
<rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
|
||||
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text>
|
||||
<text class="svg__vocab__text-box" dx="-0.7em" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">37020…</text>
|
||||
<rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
|
||||
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
|
||||
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>
|
||||
|
|
Before Width: | Height: | Size: 7.8 KiB After Width: | Height: | Size: 7.9 KiB |
|
@ -2,14 +2,16 @@
|
|||
|
||||
include ../../_includes/_mixins
|
||||
|
||||
p Map strings to and from integer IDs.
|
||||
p
|
||||
| Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values
|
||||
| instead of integer IDs. This ensures that strings always map to the
|
||||
| same hash value, even from different #[code StringStores].
|
||||
|
||||
+h(2, "init") StringStore.__init__
|
||||
+tag method
|
||||
|
||||
p
|
||||
| Create the #[code StringStore]. Note that a newly initialised store will
|
||||
| always include an empty string #[code ''] at position #[code 0].
|
||||
| Create the #[code StringStore].
|
||||
|
||||
+aside-code("Example").
|
||||
from spacy.strings import StringStore
|
||||
|
@ -44,17 +46,18 @@ p Get the number of strings in the store.
|
|||
+h(2, "getitem") StringStore.__getitem__
|
||||
+tag method
|
||||
|
||||
p Retrieve a string from a given integer ID, or vice versa.
|
||||
p Retrieve a string from a given hash, or vice versa.
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
int_id = stringstore[u'apple'] # 1
|
||||
assert stringstore[int_id] == u'apple'
|
||||
apple_hash = stringstore[u'apple']
|
||||
assert apple_hash == 8566208034543834098L
|
||||
assert stringstore[apple_hash] == u'apple'
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string_or_id]
|
||||
+cell bytes, unicode or int
|
||||
+cell bytes, unicode or uint64
|
||||
+cell The value to encode.
|
||||
|
||||
+footrow
|
||||
|
@ -94,7 +97,7 @@ p
|
|||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
all_strings = [s for s in stringstore]
|
||||
assert all_strings == [u'', u'apple', u'orange']
|
||||
assert all_strings == [u'apple', u'orange']
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+footrow
|
||||
|
@ -102,6 +105,30 @@ p
|
|||
+cell unicode
|
||||
+cell A string in the store.
|
||||
|
||||
+h(2, "add") StringStore.add
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
||||
p Add a string to the #[code StringStore].
|
||||
|
||||
+aside-code("Example").
|
||||
stringstore = StringStore([u'apple', u'orange'])
|
||||
stringstore.add(u'banana')
|
||||
assert len(stringstore) == 3
|
||||
assert stringstore[u'banana'] == 2525716904149915114L
|
||||
|
||||
+table(["Name", "Type", "Description"])
|
||||
+row
|
||||
+cell #[code string]
|
||||
+cell unicode
|
||||
+cell The string to add.
|
||||
|
||||
+footrow
|
||||
+cell returns
|
||||
+cell uint64
|
||||
+cell The string's hash value.
|
||||
|
||||
|
||||
+h(2, "to_disk") StringStore.to_disk
|
||||
+tag method
|
||||
+tag-new(2)
|
||||
|
|
|
@ -4,10 +4,10 @@ p
|
|||
| Whenever possible, spaCy tries to store data in a vocabulary, the
|
||||
| #[+api("vocab") #[code Vocab]], that will be
|
||||
| #[strong shared by multiple documents]. To save memory, spaCy also
|
||||
| encodes all strings to #[strong integer IDs] – in this case for example,
|
||||
| "coffee" has the ID #[code 3672]. Entity labels like "ORG" and
|
||||
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy
|
||||
| only "speaks" in integer IDs.
|
||||
| encodes all strings to #[strong hash values] – in this case for example,
|
||||
| "coffee" has the hash #[code 3197928453018144401L]. Entity labels like
|
||||
| "ORG" and part-of-speech tags like "VERB" are also encoded. Internally,
|
||||
| spaCy only "speaks" in hash values.
|
||||
|
||||
+aside
|
||||
| #[strong Token]: A word, punctuation mark etc. #[em in context], including
|
||||
|
@ -16,8 +16,8 @@ p
|
|||
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
|
||||
| #[strong Doc]: A processed container of tokens in context.#[br]
|
||||
| #[strong Vocab]: The collection of lexemes.#[br]
|
||||
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for
|
||||
| example #[code 3672] → "coffee".
|
||||
| #[strong StringStore]: The dictionary mapping hash values to strings, for
|
||||
| example #[code 3197928453018144401L] → "coffee".
|
||||
|
||||
+image
|
||||
include ../../../assets/img/docs/vocab_stringstore.svg
|
||||
|
@ -27,26 +27,26 @@ p
|
|||
p
|
||||
| If you process lots of documents containing the word "coffee" in all
|
||||
| kinds of different contexts, storing the exact string "coffee" every time
|
||||
| would take up way too much space. So instead, spaCy assigns it an ID
|
||||
| would take up way too much space. So instead, spaCy hashes the string
|
||||
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can
|
||||
| think of the #[code StringStore] as a
|
||||
| #[strong lookup table that works in both directions] – you can look up a
|
||||
| string to get its ID, or an ID to get its string:
|
||||
| string to get its hash, or a hash to get its string:
|
||||
|
||||
+code.
|
||||
doc = nlp(u'I like coffee')
|
||||
assert doc.vocab.strings[u'coffee'] == 3572
|
||||
assert doc.vocab.strings[3572] == u'coffee'
|
||||
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
|
||||
assert doc.vocab.strings[3197928453018144401L] == u'coffee'
|
||||
|
||||
p
|
||||
| Now that all strings are encoded, the entries in the vocabulary
|
||||
| #[strong don't need to include the word text] themselves. Instead,
|
||||
| they can look it up in the #[code StringStore] via its integer ID. Each
|
||||
| they can look it up in the #[code StringStore] via its hash value. Each
|
||||
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
|
||||
| contains the #[strong context-independent] information about a word.
|
||||
| For example, no matter if "love" is used as a verb or a noun in some
|
||||
| context, its spelling and whether it consists of alphabetic characters
|
||||
| won't ever change.
|
||||
| won't ever change. Its hash value will also always be the same.
|
||||
|
||||
+code.
|
||||
for word in doc:
|
||||
|
@ -56,39 +56,54 @@ p
|
|||
|
||||
+aside
|
||||
| #[strong Text]: The original text of the lexeme.#[br]
|
||||
| #[strong Orth]: The integer ID of the lexeme.#[br]
|
||||
| #[strong Orth]: The hash value of the lexeme.#[br]
|
||||
| #[strong Shape]: The abstract word shape of the lexeme.#[br]
|
||||
| #[strong Prefix]: By default, the first letter of the word string.#[br]
|
||||
| #[strong Suffix]: By default, the last three letters of the word string.#[br]
|
||||
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
|
||||
| #[strong is digit]: Does the lexeme consist of digits?#[br]
|
||||
| #[strong is title]: Is the lexeme in title case?#[br]
|
||||
| #[strong Lang]: The language of the parent vocabulary.
|
||||
|
||||
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"])
|
||||
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0]
|
||||
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style)
|
||||
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style)
|
||||
+annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style)
|
||||
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"])
|
||||
- var style = [0, 1, 1, 0, 0, 1, 1]
|
||||
+annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style)
|
||||
+annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style)
|
||||
+annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "fee", true, false], style)
|
||||
|
||||
p
|
||||
| The specific entries in the voabulary and their IDs don't really matter –
|
||||
| #[strong as long as they match]. That's why you always need to make sure
|
||||
| all objects you create have access to the same vocabulary. If they don't,
|
||||
| the IDs won't match and spaCy will either produce very confusing results,
|
||||
| or fail alltogether.
|
||||
| The mapping of words to hashes doesn't depend on any state. To make sure
|
||||
| each value is unique, spaCy uses a
|
||||
| #[+a("https://en.wikipedia.org/wiki/Hash_function") hash function] to
|
||||
| calculate the hash #[strong based on the word string]. This also means
|
||||
| that the hash for "coffee" will always be the same, no matter which model
|
||||
| you're using or how you've configured spaCy.
|
||||
|
||||
p
|
||||
| However, hashes #[strong cannot be reversed] and there's no way to
|
||||
| resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do
|
||||
| is look it up in the vocabulary. That's why you always need to make
|
||||
| sure all objects you create have access to the same vocabulary. If they
|
||||
| don't, spaCy might not be able to find the strings it needs.
|
||||
|
||||
+code.
|
||||
from spacy.tokens import Doc
|
||||
from spacy.vocab import Vocab
|
||||
|
||||
doc = nlp(u'I like coffee') # original Doc
|
||||
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab
|
||||
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc
|
||||
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
|
||||
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash
|
||||
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
|
||||
|
||||
empty_doc = Doc(Vocab()) # new Doc with empty Vocab
|
||||
# empty_doc.vocab.strings[3197928453018144401L] will raise an error :(
|
||||
|
||||
empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash
|
||||
assert empty_doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
|
||||
|
||||
new_doc = Doc(doc.vocab) # create new doc with first doc's vocab
|
||||
assert new_doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
|
||||
|
||||
p
|
||||
| Even though both #[code Doc] objects contain the same words, the internal
|
||||
| integer IDs are very different. The same applies for all other strings,
|
||||
| like the annotation scheme. To avoid mismatched IDs, spaCy will always
|
||||
| export the vocab if you save a #[code Doc] or #[code nlp] object.
|
||||
| If the doc's vocabulary doesn't contain a hash for "coffee", spaCy will
|
||||
| throw an error. So you either need to add it manually, or initialise the
|
||||
| new #[code Doc] with the shared vocab. To prevent this problem, spaCy
|
||||
| will usually export the vocab when you save a #[code Doc] or #[code nlp]
|
||||
| object.
|
||||
|
|
|
@ -68,13 +68,19 @@ p
|
|||
| #[strong API:] #[+api("token") #[code Token]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
|
||||
|
||||
+h(2, "examples-integer-ids") Use integer IDs for any string
|
||||
+h(2, "examples-hashes") Use hash values for any string
|
||||
|
||||
+code.
|
||||
hello_id = nlp.vocab.strings['Hello']
|
||||
hello_str = nlp.vocab.strings[hello_id]
|
||||
assert token.text == hello_id == 3125
|
||||
assert token.text == hello_str == 'Hello'
|
||||
doc = nlp(u'I love coffee')
|
||||
coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L
|
||||
coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
|
||||
|
||||
assert doc[2].orth == coffee_hash == 3197928453018144401L
|
||||
assert doc[2].text == coffee_text == u'coffee'
|
||||
|
||||
doc.vocab.strings.add(u'beer')
|
||||
beer_hash = doc.vocab.strings[u'beer'] # 3073001599257881079L
|
||||
beer_text = doc.vocab.strings[beer_hash] # 'beer'
|
||||
|
||||
+h(2, "examples-entities") Recognise and update named entities
|
||||
+tag-model("NER")
|
||||
|
|
|
@ -50,6 +50,28 @@ p
|
|||
| #[strong API:] #[+api("language") #[code Language]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
|
||||
|
||||
+h(3, "features-hash-ids") Hash values instead of integer IDs
|
||||
|
||||
+aside-code("Example").
|
||||
doc = nlp(u'I love coffee')
|
||||
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
|
||||
assert doc.vocab.strings[3197928453018144401L] == u'coffee'
|
||||
|
||||
doc.vocab.strings.add(u'beer')
|
||||
assert doc.vocab.strings[u'beer'] == 3073001599257881079L
|
||||
|
||||
p
|
||||
| The #[+api("stringstore") #[code StringStore]] now resolves all strings
|
||||
| to hash values instead of integer IDs. This means that the string-to-int
|
||||
| mapping #[strong no longer depends on the vocabulary state], making a lot
|
||||
| of workflows much simpler, especially during training. Unlike integer IDs
|
||||
| in spaCy v1.x, hash values will #[strong always match] – even across
|
||||
| models. Strings can now be added explicitly using the new #[+api("stringstore#add") #[code StringStore.add]] method.
|
||||
|
||||
+infobox
|
||||
| #[strong API:] #[+api("stringstore") #[code StringStore]]
|
||||
| #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
|
||||
|
||||
+h(3, "features-serializer") Saving, loading and serialization
|
||||
|
||||
+aside-code("Example").
|
||||
|
@ -307,6 +329,17 @@ p
|
|||
nlp.save_to_directory('/model')
|
||||
nlp.vocab.dump('/vocab')
|
||||
|
||||
+h(3, "migrating-strings") Strings and hash values
|
||||
|
||||
+code-new.
|
||||
nlp.vocab.strings.add(u'coffee')
|
||||
nlp.vocab.strings[u'coffee'] # 3197928453018144401L
|
||||
other_nlp.vocab.strings[u'coffee'] # 3197928453018144401L
|
||||
|
||||
+code-old.
|
||||
nlp.vocab.strings[u'coffee'] # 3672
|
||||
other_nlp.vocab.strings[u'coffee'] # 40259
|
||||
|
||||
+h(3, "migrating-languages") Processing pipelines and language data
|
||||
|
||||
p
|
||||
|
|
|
@ -97,7 +97,7 @@ include _includes/_mixins
|
|||
+item Part-of-speech tagging
|
||||
+item #[strong Named entity] recognition
|
||||
+item Labelled dependency parsing
|
||||
+item Convenient string-to-int mapping
|
||||
+item Convenient string-to-hash mapping
|
||||
+item Export to numpy data arrays
|
||||
+item GIL-free #[strong multi-threading]
|
||||
+item Efficient binary serialization
|
||||
|
|
Loading…
Reference in New Issue
Block a user