* Update doc strings

This commit is contained in:
Matthew Honnibal 2014-08-21 03:29:15 +02:00
parent cbda38e2d9
commit 248cbb6d07

View File

@ -14,6 +14,10 @@ cimport spacy
from spacy.orthography.latin cimport * from spacy.orthography.latin cimport *
from spacy.lexeme cimport *
from .orthography.latin import *
from .lexeme import *
@ -61,34 +65,33 @@ EN = English('en')
cpdef Tokens tokenize(unicode string): cpdef Tokens tokenize(unicode string):
"""Tokenize a string. """Tokenize a string.
Wraps EN.tokenize, where EN is an instance of the class English. The global The tokenization rules are defined in two places:
variable manages the vocabulary, and memoizes tokenization rules.
* The data/en/tokenization table, which handles special cases like contractions;
* The `spacy.en.English.find_split` function, which is used to split off punctuation etc.
Args: Args:
string (unicode): The string to be split. Must be unicode, not bytes. string (unicode): The string to be tokenized.
Returns: Returns:
tokens (Tokens): A Tokens instance, managing a vector of pointers to tokens (Tokens): A Tokens object, giving access to a sequence of LexIDs.
Lexeme structs. The Tokens instance supports sequence interfaces,
but also offers a range of sequence-level operations, which are computed
efficiently in Cython-space.
""" """
return EN.tokenize(string) return EN.tokenize(string)
cpdef Lexeme_addr lookup(unicode string) except 0:
"""Retrieve (or create) a Lexeme for a string.
Returns a Lexeme ID, which can be used via the accessor
methods in spacy.lexeme cpdef LexID lookup(unicode string) except 0:
"""Retrieve (or create, if not found) a Lexeme ID for a string.
The LexID is really a memory address, making dereferencing it essentially free.
Args: Args:
string (unicode): The string to be looked up. Must be unicode, not bytes. string (unicode): The string to be looked up. Must be unicode, not bytes.
Returns: Returns:
LexemeID (size_t): An unsigned integer that allows the Lexeme to be retrieved. lexeme (LexID): A reference to a lexical type.
The LexemeID is really a memory address, making dereferencing it essentially
free.
""" """
return <Lexeme_addr>EN.lookup(string) return <Lexeme_addr>EN.lookup(string)
@ -101,7 +104,7 @@ cpdef unicode unhash(StringHash hash_value):
although no control is taken for hash collisions. although no control is taken for hash collisions.
Args: Args:
hash_value (uint32_t): The hash of a string, returned by Python's hash() hash_value (StringHash): The hash of a string, returned by Python's hash()
function. function.
Returns: Returns: