Update docs to reflect StringStore changes

This commit is contained in:
ines 2017-05-28 18:19:11 +02:00
parent 89bf635cbe
commit 414193e9ba
7 changed files with 142 additions and 56 deletions

View File

@ -74,9 +74,9 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string return string
cdef class StringStore: cdef class StringStore:
"""Lookup strings by 64-bit hash""" """Look up strings by 64-bit hashes."""
def __init__(self, strings=None, freeze=False): def __init__(self, strings=None, freeze=False):
"""Create the StringStore. """Create the StringStore.
@ -92,9 +92,9 @@ cdef class StringStore:
self.add(string) self.add(string)
def __getitem__(self, object string_or_id): def __getitem__(self, object string_or_id):
"""Retrieve a string from a given hash ID, or vice versa. """Retrieve a string from a given hash, or vice versa.
string_or_id (bytes or unicode or uint64): The value to encode. string_or_id (bytes, unicode or uint64): The value to encode.
Returns (unicode or uint64): The value to be retrieved. Returns (unicode or uint64): The value to be retrieved.
""" """
if isinstance(string_or_id, basestring) and len(string_or_id) == 0: if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
@ -123,6 +123,11 @@ cdef class StringStore:
return decode_Utf8Str(utf8str) return decode_Utf8Str(utf8str)
def add(self, string): def add(self, string):
"""Add a string to the StringStore.
string (unicode): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode): if isinstance(string, unicode):
if string in SYMBOLS_BY_STR: if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string] return SYMBOLS_BY_STR[string]

View File

@ -7,30 +7,30 @@
</style> </style>
<rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/> <rect width="570" height="88" x="1" y="135" fill="#d5e8d4" stroke="#82b366" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 164h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">3572</text> <text class="svg__vocab__text" dx="-0.5em" dy="1em" transform="translate(477.5 174.5)" width="31" height="17">31979...</text>
<rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="468" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text> <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(471.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 164h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">508</text> <text class="svg__vocab__text" dx="-0.5em" dy="1em" width="23" height="17" transform="translate(113.5 174.5)">46904...</text>
<rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="100" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text> <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(103.5 155.5)">Lexeme</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 164h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">949</text> <text class="svg__vocab__text" dx="-0.7em" dy="1em" width="23" height="17" transform="translate(300.5 174.5)">37020...</text>
<rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="287" y="152" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text> <text class="svg__vocab__text-box" dy="0.9em" width="44" height="12" transform="translate(290.5 155.5)">Lexeme</text>
<rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/> <rect width="570" height="88" x="1" y="246" fill="#f5f5f5" stroke="#666" stroke-width="2" rx="13.2" ry="13.2"/>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M444 275h100v40H444z"/>
<text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">&quot;coffee&quot;</text> <text class="svg__vocab__text" dy="1em" width="55" height="17" transform="translate(465.5 285.5)">&quot;coffee&quot;</text>
<rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="468" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">3672</text> <text class="svg__vocab__text-box" dx="-0.5em" dy="0.9em" width="28" height="12" transform="translate(479.5 266.5)">31979…</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M76 275h100v40H76z"/>
<text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">&quot;I&quot;</text> <text class="svg__vocab__text" dy="1em" width="17" height="17" transform="translate(116.5 285.5)">&quot;I&quot;</text>
<rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="100" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">508</text> <text class="svg__vocab__text-box" dx="-0.7em" dy="0.9em" width="22" height="12" transform="translate(114.5 266.5)">46904…</text>
<path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/> <path fill="#f5f5f5" stroke="#666" stroke-width="2" d="M263 275h100v40H263z"/>
<text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">&quot;love&quot;</text> <text class="svg__vocab__text" dy="1em" width="41" height="17" transform="translate(291.5 285.5)">&quot;love&quot;</text>
<rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/> <rect width="52" height="20" x="287" y="263" fill="#666" rx="3" ry="3"/>
<text class="svg__vocab__text-box" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">949</text> <text class="svg__vocab__text-box" dx="-0.7em" dy="0.9em" width="22" height="12" transform="translate(301.5 266.5)">37020…</text>
<rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/> <rect width="570" height="110" x="1" y="1" fill="#e1d5e7" stroke="#9673a6" stroke-width="2" rx="16.5" ry="16.5"/>
<path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/> <path fill="none" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M263 60h-78.8"/>
<path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/> <path fill="#999" stroke="#999" stroke-width="2" stroke-miterlimit="10" d="M178.2 60l8-4-2 4 2 4z"/>

Before

Width:  |  Height:  |  Size: 7.8 KiB

After

Width:  |  Height:  |  Size: 7.9 KiB

View File

@ -2,14 +2,16 @@
include ../../_includes/_mixins include ../../_includes/_mixins
p Map strings to and from integer IDs. p
| Look up strings by 64-bit hashes. As of v2.0, spaCy uses hash values
| instead of integer IDs. This ensures that strings always map to the
| same ID, even from different #[code StringStores].
+h(2, "init") StringStore.__init__ +h(2, "init") StringStore.__init__
+tag method +tag method
p p
| Create the #[code StringStore]. Note that a newly initialised store will | Create the #[code StringStore].
| always include an empty string #[code ''] at position #[code 0].
+aside-code("Example"). +aside-code("Example").
from spacy.strings import StringStore from spacy.strings import StringStore
@ -44,17 +46,18 @@ p Get the number of strings in the store.
+h(2, "getitem") StringStore.__getitem__ +h(2, "getitem") StringStore.__getitem__
+tag method +tag method
p Retrieve a string from a given integer ID, or vice versa. p Retrieve a string from a given hash, or vice versa.
+aside-code("Example"). +aside-code("Example").
stringstore = StringStore([u'apple', u'orange']) stringstore = StringStore([u'apple', u'orange'])
int_id = stringstore[u'apple'] # 1 apple_hash = stringstore[u'apple']
assert stringstore[int_id] == u'apple' assert apple_hash == 8566208034543834098L
assert stringstore[apple_hash] == u'apple'
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+row +row
+cell #[code string_or_id] +cell #[code string_or_id]
+cell bytes, unicode or int +cell bytes, unicode or uint64
+cell The value to encode. +cell The value to encode.
+footrow +footrow
@ -94,7 +97,7 @@ p
+aside-code("Example"). +aside-code("Example").
stringstore = StringStore([u'apple', u'orange']) stringstore = StringStore([u'apple', u'orange'])
all_strings = [s for s in stringstore] all_strings = [s for s in stringstore]
assert all_strings == [u'', u'apple', u'orange'] assert all_strings == [u'apple', u'orange']
+table(["Name", "Type", "Description"]) +table(["Name", "Type", "Description"])
+footrow +footrow
@ -102,6 +105,30 @@ p
+cell unicode +cell unicode
+cell A string in the store. +cell A string in the store.
+h(2, "add") StringStore.add
+tag method
+tag-new(2)
p Add a string to the #[code StringStore].
+aside-code("Example").
stringstore = StringStore([u'apple', u'orange'])
stringstore.add(u'banana')
assert len(stringstore) == 3
assert stringstore[u'banana'] == 2525716904149915114L
+table(["Name", "Type", "Description"])
+row
+cell #[code string]
+cell unicode
+cell The string to add.
+footrow
+cell returns
+cell uint64
+cell The string's hash value.
+h(2, "to_disk") StringStore.to_disk +h(2, "to_disk") StringStore.to_disk
+tag method +tag method
+tag-new(2) +tag-new(2)

View File

@ -4,10 +4,10 @@ p
| Whenever possible, spaCy tries to store data in a vocabulary, the | Whenever possible, spaCy tries to store data in a vocabulary, the
| #[+api("vocab") #[code Vocab]], that will be | #[+api("vocab") #[code Vocab]], that will be
| #[strong shared by multiple documents]. To save memory, spaCy also | #[strong shared by multiple documents]. To save memory, spaCy also
| encodes all strings to #[strong integer IDs] &mdash; in this case, for example, | encodes all strings to #[strong hash values] &mdash; in this case, for example,
| "coffee" has the ID #[code 3672]. Entity labels like "ORG" and | "coffee" has the hash #[code 3197928453018144401L]. Entity labels like
| part-of-speech tags like "VERB" are also encoded. Internally, spaCy | "ORG" and part-of-speech tags like "VERB" are also encoded. Internally,
| only "speaks" in integer IDs. | spaCy only "speaks" in hash values.
+aside +aside
| #[strong Token]: A word, punctuation mark etc. #[em in context], including | #[strong Token]: A word, punctuation mark etc. #[em in context], including
@ -16,8 +16,8 @@ p
| and flags, e.g. if it's lowercase, a digit or punctuation.#[br] | and flags, e.g. if it's lowercase, a digit or punctuation.#[br]
| #[strong Doc]: A processed container of tokens in context.#[br] | #[strong Doc]: A processed container of tokens in context.#[br]
| #[strong Vocab]: The collection of lexemes.#[br] | #[strong Vocab]: The collection of lexemes.#[br]
| #[strong StringStore]: The dictionary mapping integer IDs to strings, for | #[strong StringStore]: The dictionary mapping hash values to strings, for
| example #[code 3672] &rarr; "coffee". | example #[code 3197928453018144401L] &rarr; "coffee".
+image +image
include ../../../assets/img/docs/vocab_stringstore.svg include ../../../assets/img/docs/vocab_stringstore.svg
@ -27,26 +27,26 @@ p
p p
| If you process lots of documents containing the word "coffee" in all | If you process lots of documents containing the word "coffee" in all
| kinds of different contexts, storing the exact string "coffee" every time | kinds of different contexts, storing the exact string "coffee" every time
| would take up way too much space. So instead, spaCy assigns it an ID | would take up way too much space. So instead, spaCy hashes the string
| and stores it in the #[+api("stringstore") #[code StringStore]]. You can | and stores it in the #[+api("stringstore") #[code StringStore]]. You can
| think of the #[code StringStore] as a | think of the #[code StringStore] as a
| #[strong lookup table that works in both directions] &mdash; you can look up a | #[strong lookup table that works in both directions] &mdash; you can look up a
| string to get its ID, or an ID to get its string: | string to get its hash, or a hash to get its string:
+code. +code.
doc = nlp(u'I like coffee') doc = nlp(u'I like coffee')
assert doc.vocab.strings[u'coffee'] == 3572 assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
assert doc.vocab.strings[3572] == u'coffee' assert doc.vocab.strings[3197928453018144401L] == u'coffee'
p p
| Now that all strings are encoded, the entries in the vocabulary | Now that all strings are encoded, the entries in the vocabulary
| #[strong don&apos;t need to include the word text] themselves. Instead, | #[strong don&apos;t need to include the word text] themselves. Instead,
| they can look it up in the #[code StringStore] via its integer ID. Each | they can look it up in the #[code StringStore] via its hash value. Each
| entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]], | entry in the vocabulary, also called #[+api("lexeme") #[code Lexeme]],
| contains the #[strong context-independent] information about a word. | contains the #[strong context-independent] information about a word.
| For example, no matter if "love" is used as a verb or a noun in some | For example, no matter if "love" is used as a verb or a noun in some
| context, its spelling and whether it consists of alphabetic characters | context, its spelling and whether it consists of alphabetic characters
| won't ever change. | won't ever change. Its hash value will also always be the same.
+code. +code.
for word in doc: for word in doc:
@ -56,39 +56,54 @@ p
+aside +aside
| #[strong Text]: The original text of the lexeme.#[br] | #[strong Text]: The original text of the lexeme.#[br]
| #[strong Orth]: The integer ID of the lexeme.#[br] | #[strong Orth]: The hash value of the lexeme.#[br]
| #[strong Shape]: The abstract word shape of the lexeme.#[br] | #[strong Shape]: The abstract word shape of the lexeme.#[br]
| #[strong Prefix]: By default, the first letter of the word string.#[br] | #[strong Prefix]: By default, the first letter of the word string.#[br]
| #[strong Suffix]: By default, the last three letters of the word string.#[br] | #[strong Suffix]: By default, the last three letters of the word string.#[br]
| #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br] | #[strong is alpha]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong is digit]: Does the lexeme consist of digits?#[br] | #[strong is digit]: Does the lexeme consist of digits?#[br]
| #[strong is title]: Does the lexeme consist of alphabetic characters?#[br]
| #[strong Lang]: The language of the parent vocabulary.
+table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit", "is_title", "lang"]) +table(["text", "orth", "shape", "prefix", "suffix", "is_alpha", "is_digit"])
- var style = [0, 1, 1, 0, 0, 1, 1, 1, 0] - var style = [0, 1, 1, 0, 0, 1, 1]
+annotation-row(["I", 508, "X", "I", "I", true, false, true, "en"], style) +annotation-row(["I", "4690420944186131903L", "X", "I", "I", true, false], style)
+annotation-row(["love", 949, "xxxx", "l", "ove", true, false, false, "en"], style) +annotation-row(["love", "3702023516439754181L", "xxxx", "l", "ove", true, false], style)
+annotation-row(["coffee", 3572, "xxxx", "c", "ffe", true, false, false, "en"], style) +annotation-row(["coffee", "3197928453018144401L", "xxxx", "c", "ffe", true, false], style)
p p
| The specific entries in the vocabulary and their IDs don't really matter | The mapping of words to hashes doesn't depend on any state. To make sure
| #[strong as long as they match]. That's why you always need to make sure | each value is unique, spaCy uses a
| all objects you create have access to the same vocabulary. If they don't, | #[+a("https://en.wikipedia.org/wiki/Hash_function") hash function] to
| the IDs won't match and spaCy will either produce very confusing results, | calculate the hash #[strong based on the word string]. This also means
| or fail altogether. | that the hash for "coffee" will always be the same, no matter which model
| you're using or how you've configured spaCy.
p
| However, hashes #[strong cannot be reversed] and there's no way to
| resolve #[code 3197928453018144401L] back to "coffee". All spaCy can do
| is look it up in the vocabulary. That's why you always need to make
| sure all objects you create have access to the same vocabulary. If they
| don't, spaCy might not be able to find the strings it needs.
+code. +code.
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
doc = nlp(u'I like coffee') # original Doc doc = nlp(u'I like coffee') # original Doc
new_doc = Doc(Vocab(), words=['I', 'like', 'coffee']) # new Doc with empty Vocab assert doc.vocab.strings[u'coffee'] == 3197928453018144401L # get hash
assert doc.vocab.strings[u'coffee'] == 3572 # ID in vocab of Doc assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
assert new_doc.vocab.strings[u'coffee'] == 446 # ID in vocab of new Doc
empty_doc = Doc(Vocab()) # new Doc with empty Vocab
# doc.vocab.strings[3197928453018144401L] will raise an error :(
empty_doc.vocab.strings.add(u'coffee') # add "coffee" and generate hash
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
new_doc = Doc(doc.vocab) # create new doc with first doc's vocab
assert doc.vocab.strings[3197928453018144401L] == u'coffee' # 👍
p p
| Even though both #[code Doc] objects contain the same words, the internal | If the doc's vocabulary doesn't contain a hash for "coffee", spaCy will
| integer IDs are very different. The same applies for all other strings, | throw an error. So you either need to add it manually, or initialise the
| like the annotation scheme. To avoid mismatched IDs, spaCy will always | new #[code Doc] with the shared vocab. To prevent this problem, spaCy
| export the vocab if you save a #[code Doc] or #[code nlp] object. | will usually export the vocab when you save a #[code Doc] or #[code nlp]
| object.

View File

@ -68,13 +68,19 @@ p
| #[strong API:] #[+api("token") #[code Token]] | #[strong API:] #[+api("token") #[code Token]]
| #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging] | #[strong Usage:] #[+a("/docs/usage/pos-tagging") Part-of-speech tagging]
+h(2, "examples-integer-ids") Use integer IDs for any string +h(2, "examples-hashes") Use hash values for any string
+code. +code.
hello_id = nlp.vocab.strings['Hello'] doc = nlp(u'I love coffee')
hello_str = nlp.vocab.strings[hello_id] coffee_hash = nlp.vocab.strings[u'coffee'] # 3197928453018144401L
assert token.text == hello_id == 3125 coffee_text = nlp.vocab.strings[coffee_hash] # 'coffee'
assert token.text == hello_str == 'Hello'
assert doc[2].orth == coffee_hash == 3197928453018144401L
assert doc[2].text == coffee_text == u'coffee'
doc.vocab.strings.add(u'beer')
beer_hash = doc.vocab.strings[u'beer'] # 3073001599257881079L
beer_text = doc.vocab.strings[beer_hash] # 'beer'
+h(2, "examples-entities") Recognise and update named entities +h(2, "examples-entities") Recognise and update named entities
+tag-model("NER") +tag-model("NER")

View File

@ -50,6 +50,28 @@ p
| #[strong API:] #[+api("language") #[code Language]] | #[strong API:] #[+api("language") #[code Language]]
| #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text] | #[strong Usage:] #[+a("/docs/usage/language-processing-pipeline") Processing text]
+h(3, "features-hash-ids") Hash values instead of integer IDs
+aside-code("Example").
doc = nlp(u'I love coffee')
assert doc.vocab.strings[u'coffee'] == 3197928453018144401L
assert doc.vocab.strings[3197928453018144401L] == u'coffee'
doc.vocab.strings.add(u'beer')
assert doc.vocab.strings[u'beer'] == 3073001599257881079L
p
| The #[+api("stringstore") #[code StringStore]] now resolves all strings
| to hash values instead of integer IDs. This means that the string-to-int
| mapping #[strong no longer depends on the vocabulary state], making a lot
| of workflows much simpler, especially during training. Unlike integer IDs
| in spaCy v1.x, hash values will #[strong always match] even across
| models. Strings can now be added explicitly using the new #[+api("stringstore#add") #[code StringStore.add]] method.
+infobox
| #[strong API:] #[+api("stringstore") #[code StringStore]]
| #[strong Usage:] #[+a("/docs/usage/spacy-101#vocab") Vocab, hashes and lexemes 101]
+h(3, "features-serializer") Saving, loading and serialization +h(3, "features-serializer") Saving, loading and serialization
+aside-code("Example"). +aside-code("Example").
@ -307,6 +329,17 @@ p
nlp.save_to_directory('/model') nlp.save_to_directory('/model')
nlp.vocab.dump('/vocab') nlp.vocab.dump('/vocab')
+h(3, "migrating-strings") Strings and hash values
+code-new.
nlp.vocab.strings.add(u'coffee')
nlp.vocab.strings[u'coffee'] # 3197928453018144401L
other_nlp.vocab.strings[u'coffee'] # 3197928453018144401L
+code-old.
nlp.vocab.strings[u'coffee'] # 3672
other_nlp.vocab.strings[u'coffee'] # 40259
+h(3, "migrating-languages") Processing pipelines and language data +h(3, "migrating-languages") Processing pipelines and language data
p p

View File

@ -97,7 +97,7 @@ include _includes/_mixins
+item Part-of-speech tagging +item Part-of-speech tagging
+item #[strong Named entity] recognition +item #[strong Named entity] recognition
+item Labelled dependency parsing +item Labelled dependency parsing
+item Convenient string-to-int mapping +item Convenient string-to-hash mapping
+item Export to numpy data arrays +item Export to numpy data arrays
+item GIL-free #[strong multi-threading] +item GIL-free #[strong multi-threading]
+item Efficient binary serialization +item Efficient binary serialization