mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			106 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			106 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
//- ----------------------------------
 | 
						|
//- 💫 DOCS > API > STRINGSTORE
 | 
						|
//- ----------------------------------
 | 
						|
 | 
						|
+section("stringstore")
 | 
						|
    +h(2, "stringstore", "https://github.com/" + SOCIAL.github + "/spaCy/blob/master/spacy/strings.pyx")
 | 
						|
        | #[+tag class] StringStore
 | 
						|
 | 
						|
    p Intern strings, and map them to sequential integer IDs.
 | 
						|
 | 
						|
    p.
 | 
						|
        Only the integer IDs are held by spaCy's data
 | 
						|
        classes (#[code Doc], #[code Token], #[code Span] and #[code Lexeme])
 | 
						|
        – when you use a string-valued attribute like #[code token.orth_],
 | 
						|
        you access a property that computes #[code token.vocab.strings[token.orth]].
 | 
						|
 | 
						|
        +aside("Efficiency").
 | 
						|
            The mapping table is very efficient, and a small-string optimization
 | 
						|
            is used to maintain a small memory footprint.
 | 
						|
 | 
						|
 | 
						|
    +table(["Usage", "Description"])
 | 
						|
        +row
 | 
						|
            +cell #[code string = string_store[int_id]]
 | 
						|
            +cell.
 | 
						|
                Retrieve a string from a given integer ID. If the integer ID
 | 
						|
                is not found, raise #[code IndexError].
 | 
						|
 | 
						|
        +row
 | 
						|
            +cell #[code int_id = string_store[unicode_string]]
 | 
						|
            +cell.
 | 
						|
                Map a unicode string to an integer ID. If the string is
 | 
						|
                previously unseen, it is interned, and a new ID is returned.
 | 
						|
 | 
						|
        +row
 | 
						|
            +cell #[code int_id = string_store[utf8_byte_string]]
 | 
						|
            +cell.
 | 
						|
                Byte strings are assumed to be in UTF-8 encoding. Strings
 | 
						|
                encoded with other codecs may fail silently. Given a UTF-8 byte
 | 
						|
                string, the behaviour is the same as for unicode strings.
 | 
						|
                Internally, strings are stored in UTF-8 format. So if you start
 | 
						|
                with a UTF-8 byte string, it's less efficient to first decode
 | 
						|
                it as unicode, as StringStore will then have to encode it as
 | 
						|
                UTF-8 once again.
 | 
						|
 | 
						|
        +row
 | 
						|
            +cell #[code n_strings = len(string_store)]
 | 
						|
            +cell.
 | 
						|
                Number of strings in the string store.
 | 
						|
 | 
						|
        +row
 | 
						|
            +cell #[code for string in string_store]
 | 
						|
            +cell
 | 
						|
                p.
 | 
						|
                    Iterate over strings in the string store, in order, such
 | 
						|
                    that the ith string in the sequence has the ID #[code i]:
 | 
						|
 | 
						|
                +code.code-block-small.no-block.
 | 
						|
                    string_store = doc.vocab.strings
 | 
						|
                    for i, string in enumerate(string_store):
 | 
						|
                        assert i == string_store[string]
 | 
						|
 | 
						|
    +section("stringstore-init")
 | 
						|
        +h(3, "stringstore-init")
 | 
						|
            | #[+tag method] StringStore.__init__
 | 
						|
 | 
						|
        +code("python", "Definition").
 | 
						|
            def __init__(self):
 | 
						|
                return self
 | 
						|
 | 
						|
    +section("stringstore-dump")
 | 
						|
        +h(3, "stringstore-dump")
 | 
						|
            | #[+tag method] StringStore.dump
 | 
						|
 | 
						|
        p Save the string-to-int mapping to the given file.
 | 
						|
 | 
						|
        +code("python", "Definition").
 | 
						|
            def dump(self, file):
 | 
						|
                return None
 | 
						|
 | 
						|
        +table(["Name", "Type", "Description"])
 | 
						|
            +row
 | 
						|
                +cell file
 | 
						|
                +cell file
 | 
						|
                +cell.
 | 
						|
                    The file to write the data to.
 | 
						|
 | 
						|
    +section("stringstore-load")
 | 
						|
        +h(3, "stringstore-load")
 | 
						|
            | #[+tag method] StringStore.load
 | 
						|
 | 
						|
        p Load the strings from the given file.
 | 
						|
 | 
						|
        +code("python", "Definition").
 | 
						|
            def load(self, file):
 | 
						|
                return None
 | 
						|
 | 
						|
        +table(["Name", "Type", "Description"])
 | 
						|
            +row
 | 
						|
                +cell file
 | 
						|
                +cell file
 | 
						|
                +cell.
 | 
						|
                    File-like object to load the data from. The format is subject
 | 
						|
                    to change, so if you need to read/write compatible files, please
 | 
						|
                    find details in the strings.pyx source.
 |