Update docs to reflect StringStore changes

This commit is contained in:
ines 2017-05-28 18:19:11 +02:00
parent 89bf635cbe
commit 414193e9ba
7 changed files with 142 additions and 56 deletions

View File

@ -74,9 +74,9 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string return string
cdef class StringStore: cdef class StringStore:
"""Lookup strings by 64-bit hash""" """Look up strings by 64-bit hashes."""
def __init__(self, strings=None, freeze=False): def __init__(self, strings=None, freeze=False):
"""Create the StringStore. """Create the StringStore.
@ -92,9 +92,9 @@ cdef class StringStore:
self.add(string) self.add(string)
def __getitem__(self, object string_or_id): def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash ID, or vice versa. """Retrieve a string from a given hash, or vice versa.
string_or_id (bytes or unicode or uint64): The value to encode. string_or_id (bytes, unicode or uint64): The value to encode.
Returns (unicode or uint64): The value to be retrieved. Returns (unicode or uint64): The value to be retrieved.
""" """
if isinstance(string_or_id, basestring) and len(string_or_id) == 0: if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
@ -123,6 +123,11 @@ cdef class StringStore:
return decode_Utf8Str(utf8str) return decode_Utf8Str(utf8str)
def add(self, string): def add(self, string):
"""Add a string to the StringStore.
string (unicode): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode): if isinstance(string, unicode):
if string in SYMBOLS_BY_STR: if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string] return SYMBOLS_BY_STR[string]

View File

@ -7,30 +7,30 @@
</style> </style>
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/> <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text> <text class="svg__vocab__text" dx="-0.5em" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">31979...</text>
<rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text> <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text> <text class="svg__vocab__text" dx="-0.5em" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">46904...</text>
<rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text> <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text> <text class="svg__vocab__text" dx="-0.7em" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">37020...</text>
<rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text> <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
<rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/> <rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">&quot;coffee&quot;</text> <text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">&quot;coffee&quot;</text>
<rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3672</text> <text class="svg__vocab__text-box" dx="-0.5em" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">31979…</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">&quot;I&quot;</text> <text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">&quot;I&quot;</text>
<rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text> <text class="svg__vocab__text-box" dx="-0.7em" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">46904…</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">&quot;love&quot;</text> <text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">&quot;love&quot;</text>
<rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text> <text class="svg__vocab__text-box" dx="-0.7em" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">37020…</text>
<rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/> <rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>

Before

Width:  |  Height:  |  Size: 7.8 KiB

After

Width:  |  Height:  |  Size: 7.9 KiB

View File

@ -2,14 +2,16 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p Map strings to and from integer IDs. p
| Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values
| instead of integer IDs. This ensures that strings always map to the
| same ID, even from different #[code StringStores].
+h(2, "init") StringStore.__init__ +h(2, "init") StringStore.__init__
+tag method +tag method
p p
| Create the #[code StringStore]. Note that a newly initialised store will | Create the #[code StringStore].
| always include an empty string #[code ''] at position #[code 0].
+aside-code("Example"). +aside-code("Example").
from spacy.strings import StringStore from spacy.strings import StringStore
@ -44,17 +46,18 @@ p Get the number of strings in the store.
+h(2, "getitem") StringStore.__getitem__ +h(2, "getitem") StringStore.__getitem__
+tag method +tag method
p Retrieve a string from a given integer ID, or vice versa. p Retrieve a string from a given hash, or vice versa.
+aside-code("Example"). +aside-code("Example").
stringstore = StringStore([u'apple', u'orange']) stringstore = StringStore([u'apple', u'orange'])
int_id = stringstore[u'apple'] # 1 apple_hash = stringstore[u'apple']
assert stringstore[int_id] == u'apple' assert apple_hash == 8566208034543834098L
assert stringstore[apple_hash] == u'apple'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code string_or_id] +cell #[code string_or_id]
+cell bytes, unicode or int +cell bytes, unicode or uint64
+cell The value to encode. +cell The value to encode.
+footrow +footrow
@ -94,7 +97,7 @@ p
+aside-code("Example"). +aside-code("Example").
stringstore = StringStore([u'apple', u'orange']) stringstore = StringStore([u'apple', u'orange'])
all_strings = [s for s in stringstore] all_strings = [s for s in stringstore]
assert all_strings == [u'', u'apple', u'orange'] assert all_strings == [u'apple', u'orange']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
@ -102,6 +105,30 @@ p
+cell unicode +cell unicode
+cell A string in the store. +cell A string in the store.
+h(2, "add") StringStore.add
+tag method
+tag-new(2)
p Add a string to the #[code StringStore].
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
stringstore.add(u'banana')
assert len(stringstore) == 3
assert stringstore[u'banana'] == 2525716904149915114L
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to add.
+footrow
+cell returns
+cell uint64
+cell The string's hash value.
+h(2, "to_disk") StringStore.to_disk +h(2, "to_disk") StringStore.to_disk
+tag method +tag method
+tag-new(2) +tag-new(2)

View File

@ -4,10 +4,10 @@ p
| Whenever possible, spaCy tries to store data in a vocabulary, the | Whenever possible, spaCy tries to store data in a vocabulary, the
| #[+api("vocab") #[code Vocab]], that will be | #[+api("vocab") #[code Vocab]], that will be
| #[strong shared by multiple documents]. To save memory, spaCy also | #[strong shared by multiple documents]. To save memory, spaCy also
| encodes all strings to #[strong integer IDs] &mdash; in this case, for example, | encodes all strings to #[strong hash values] &mdash; in this case, for example,
| "coffee" has the ID #[code 3672]. Entity labels like "ORG" and | "coffee" has the hash #[code 3197928453018144401L]. Entity labels like
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally,
| only "speaks" in integer IDs. | spaCy only "speaks" in hash values.
+aside +aside
| #[strong Token]: A word, punctuation mark etc. #[em in context], including | #[strong Token]: A word, punctuation mark etc. #[em in context], including
@ -16,8 +16,8 @@ p
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br] | and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
| #[strong Doc]: A processed container of tokens in context.#[br] | #[strong Doc]: A processed container of tokens in context.#[br]
| #[strong Vocab]: The collection of lexemes.#[br] | #[strong Vocab]: The collection of lexemes.#[br]
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for | #[strong StringStore]: The dictionary mapping hash values to strings, for
| example #[code 3672] &rarr; "coffee". | example #[code 3197928453018144401L] &rarr; "coffee".
+image +image
include ../../../assets/img/docs/vocab_stringstore.svg include ../../../assets/img/docs/vocab_stringstore.svg
@ -27,26 +27,26 @@ p
p p
| If you process lots of documents containing the word "coffee" in all | If you process lots of documents containing the word "coffee" in all
| kinds of different contexts, storing the exact string "coffee" every time | kinds of different contexts, storing the exact string "coffee" every time
| would take up way too much space. So instead, spaCy assigns it an ID | would take up way too much space. So instead, spaCy hashes the string
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can | and stores it in the #[+api("stringstore") #[code StringStore]]. You can
| think of the #[code StringStore] as a | think of the #[code StringStore] as a
| #[strong lookup table that works in both directions] &mdash; you can look up a | #[strong lookup table that works in both directions] &mdash; you can look up a
| string to get its ID, or an ID to get its string: | string to get its hash, or a hash to get its string:
+code. +code.
doc = nlp(u'I like coffee') doc = nlp(u'I like coffee')
assert doc.vocab.strings[u'coffee'] == 3572 assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
assert doc.vocab.strings[3572] == u'coffee' assert doc.vocab.strings[3197928453018144401L] == u'coffee'
p p
| Now that all strings are encoded, the entries in the vocabulary | Now that all strings are encoded, the entries in the vocabulary
| #[strong don&apos;t need to include the word text] themselves. Instead, | #[strong don&apos;t need to include the word text] themselves. Instead,
| they can look it up in the #[code StringStore] via its integer ID. Each | they can look it up in the #[code StringStore] via its hash value. Each
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]], | entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
| contains the #[strong context-independent] information about a word. | contains the #[strong context-independent] information about a word.
| For example, no matter if "love" is used as a verb or a noun in some | For example, no matter if "love" is used as a verb or a noun in some
| context, its spelling and whether it consists of alphabetic characters | context, its spelling and whether it consists of alphabetic characters
| won't ever change. | won't ever change. Its hash value will also always be the same.
+code. +code.
for word in doc: for word in doc:
@ -56,39 +56,54 @@ p
+aside +aside
| #[strong Text]: The original text of the lexeme.#[br] | #[strong Text]: The original text of the lexeme.#[br]
| #[strong Orth]: The integer ID of the lexeme.#[br] | #[strong Orth]: The hash value of the lexeme.#[br]
| #[strong Shape]: The abstract word shape of the lexeme.#[br] | #[strong Shape]: The abstract word shape of the lexeme.#[br]
| #[strong Prefix]: By default, the first letter of the word string.#[br] | #[strong Prefix]: By default, the first letter of the word string.#[br]
| #[strong Suffix]: By default, the last three letters of the word string.#[br] | #[strong Suffix]: By default, the last three letters of the word string.#[br]
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br] | #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong is digit]: Does the lexeme consist of digits?#[br] | #[strong is digit]: Does the lexeme consist of digits?#[br]
| #[strong is title]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong Lang]: The language of the parent vocabulary.
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"]) +table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"])
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0] - var style = [0, 1, 1, 0, 0, 1, 1]
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style) +annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style)
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style) +annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style)
+annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style) +annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style)
p p
| The specific entries in the vocabulary and their IDs don't really matter | The mapping of words to hashes doesn't depend on any state. To make sure
| #[strong as long as they match]. That's why you always need to make sure | each value is unique, spaCy uses a
| all objects you create have access to the same vocabulary. If they don't, | #[+a("https://en.wikipedia.org/wiki/Hash_function") hash function] to
| the IDs won't match and spaCy will either produce very confusing results, | calculate the hash #[strong based on the word string]. This also means
| or fail altogether. | that the hash for "coffee" will always be the same, no matter which model
| you're using or how you've configured spaCy.
p
| However, hashes #[strong cannot be reversed] and there's no way to
| resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do
| is look it up in the vocabulary. That's why you always need to make
| sure all objects you create have access to the same vocabulary. If they
| don't, spaCy might not be able to find the strings it needs.
+code. +code.
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
doc = nlp(u'I like coffee') # original Doc doc = nlp(u'I like coffee') # original Doc
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
empty_doc = Doc(Vocab()) # new Doc with empty Vocab
# doc.vocab.strings[3197928453018144401L] will raise an error :(
empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
new_doc = Doc(doc.vocab) # create new doc with first doc's vocab
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
p p
| Even though both #[code Doc] objects contain the same words, the internal | If the doc's vocabulary doesn't contain a hash for "coffee", spaCy will
| integer IDs are very different. The same applies for all other strings, | throw an error. So you either need to add it manually, or initialise the
| like the annotation scheme. To avoid mismatched IDs, spaCy will always | new #[code Doc] with the shared vocab. To prevent this problem, spaCy
| export the vocab if you save a #[code Doc] or #[code nlp] object. | will usually export the vocab when you save a #[code Doc] or #[code nlp]
| object.

View File

@ -68,13 +68,19 @@ p
| #[strong API:] #[+api("token") #[code Token]] | #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging] | #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
+h(2, "examples-integer-ids") Use integer IDs for any string +h(2, "examples-hashes") Use hash values for any string
+code. +code.
hello_id = nlp.vocab.strings['Hello'] doc = nlp(u'I love coffee')
hello_str = nlp.vocab.strings[hello_id] coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L
assert token.text == hello_id == 3125 coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
assert token.text == hello_str == 'Hello'
assert doc[2].orth == coffee_hash == 3197928453018144401L
assert doc[2].text == coffee_text == u'coffee'
doc.vocab.strings.add(u'beer')
beer_hash = doc.vocab.strings[u'beer'] # 3073001599257881079L
beer_text = doc.vocab.strings[beer_hash] # 'beer'
+h(2, "examples-entities") Recognise and update named entities +h(2, "examples-entities") Recognise and update named entities
+tag-model("NER") +tag-model("NER")

View File

@ -50,6 +50,28 @@ p
| #[strong API:] #[+api("language") #[code Language]] | #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
+h(3, "features-hash-ids") Hash values instead of integer IDs
+aside-code("Example").
doc = nlp(u'I love coffee')
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
assert doc.vocab.strings[3197928453018144401L] == u'coffee'
doc.vocab.strings.add(u'beer')
assert doc.vocab.strings[u'beer'] == 3073001599257881079L
p
| The #[+api("stringstore") #[code StringStore]] now resolves all strings
| to hash values instead of integer IDs. This means that the string-to-int
| mapping #[strong no longer depends on the vocabulary state], making a lot
| of workflows much simpler, especially during training. Unlike integer IDs
| in spaCy v1.x, hash values will #[strong always match] even across
| models. Strings can now be added explicitly using the new #[+api("stringstore#add") #[code StringStore.add]] method.
+infobox
| #[strong API:] #[+api("stringstore") #[code StringStore]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+h(3, "features-serializer") Saving, loading and serialization +h(3, "features-serializer") Saving, loading and serialization
+aside-code("Example"). +aside-code("Example").
@ -307,6 +329,17 @@ p
nlp.save_to_directory('/model') nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab') nlp.vocab.dump('/vocab')
+h(3, "migrating-strings") Strings and hash values
+code-new.
nlp.vocab.strings.add(u'coffee')
nlp.vocab.strings[u'coffee'] # 3197928453018144401L
other_nlp.vocab.strings[u'coffee'] # 3197928453018144401L
+code-old.
nlp.vocab.strings[u'coffee'] # 3672
other_nlp.vocab.strings[u'coffee'] # 40259
+h(3, "migrating-languages") Processing pipelines and language data +h(3, "migrating-languages") Processing pipelines and language data
p p

View File

@ -97,7 +97,7 @@ include _includes/_mixins
+item Part-of-speech tagging +item Part-of-speech tagging
+item #[strong Named entity] recognition +item #[strong Named entity] recognition
+item Labelled dependency parsing +item Labelled dependency parsing
+item Convenient string-to-int mapping +item Convenient string-to-hash mapping
+item Export to numpy data arrays +item Export to numpy data arrays
+item GIL-free #[strong multi-threading] +item GIL-free #[strong multi-threading]
+item Efficient binary serialization +item Efficient binary serialization