mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Update doc strings
This commit is contained in:
parent
cbda38e2d9
commit
248cbb6d07
35
spacy/en.pyx
35
spacy/en.pyx
|
@ -14,6 +14,10 @@ cimport spacy
|
||||||
|
|
||||||
|
|
||||||
from spacy.orthography.latin cimport *
|
from spacy.orthography.latin cimport *
|
||||||
|
from spacy.lexeme cimport *
|
||||||
|
|
||||||
|
from .orthography.latin import *
|
||||||
|
from .lexeme import *
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,34 +65,33 @@ EN = English('en')
|
||||||
cpdef Tokens tokenize(unicode string):
|
cpdef Tokens tokenize(unicode string):
|
||||||
"""Tokenize a string.
|
"""Tokenize a string.
|
||||||
|
|
||||||
Wraps EN.tokenize, where EN is an instance of the class English. The global
|
The tokenization rules are defined in two places:
|
||||||
variable manages the vocabulary, and memoizes tokenization rules.
|
|
||||||
|
* The data/en/tokenization table, which handles special cases like contractions;
|
||||||
|
* The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
string (unicode): The string to be split. Must be unicode, not bytes.
|
string (unicode): The string to be tokenized.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
tokens (Tokens): A Tokens instance, managing a vector of pointers to
|
tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
|
||||||
Lexeme structs. The Tokens instance supports sequence interfaces,
|
|
||||||
but also offers a range of sequence-level operations, which are computed
|
|
||||||
efficiently in Cython-space.
|
|
||||||
"""
|
"""
|
||||||
return EN.tokenize(string)
|
return EN.tokenize(string)
|
||||||
|
|
||||||
|
|
||||||
cpdef Lexeme_addr lookup(unicode string) except 0:
|
|
||||||
"""Retrieve (or create) a Lexeme for a string.
|
|
||||||
|
|
||||||
Returns a Lexeme ID, which can be used via the accessor
|
# +49 151 4336 2587
|
||||||
methods in spacy.lexeme
|
|
||||||
|
|
||||||
|
|
||||||
|
cpdef LexID lookup(unicode string) except 0:
|
||||||
|
"""Retrieve (or create, if not found) a Lexeme ID for a string.
|
||||||
|
|
||||||
|
The LexID is really a memory address, making dereferencing it essentially free.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
string (unicode): The string to be looked up. Must be unicode, not bytes.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved.
|
lexeme (LexID): A reference to a lexical type.
|
||||||
The LexemeID is really a memory address, making dereferencing it essentially
|
|
||||||
free.
|
|
||||||
"""
|
"""
|
||||||
return <Lexeme_addr>EN.lookup(string)
|
return <Lexeme_addr>EN.lookup(string)
|
||||||
|
|
||||||
|
@ -101,7 +104,7 @@ cpdef unicode unhash(StringHash hash_value):
|
||||||
although no control is taken for hash collisions.
|
although no control is taken for hash collisions.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
hash_value (uint32_t): The hash of a string, returned by Python's hash()
|
hash_value (StringHash): The hash of a string, returned by Python's hash()
|
||||||
function.
|
function.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
Loading…
Reference in New Issue
Block a user