mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
Use consistent formatting for docstrings
This commit is contained in:
parent
d13f0a7017
commit
561f2a3eb4
|
@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert
|
|||
|
||||
|
||||
class CLI(object):
|
||||
"""Command-line interface for spaCy"""
|
||||
|
||||
"""
|
||||
Command-line interface for spaCy
|
||||
"""
|
||||
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
|
||||
|
||||
@plac.annotations(
|
||||
|
@ -29,7 +30,6 @@ class CLI(object):
|
|||
can be shortcut, model name or, if --direct flag is set, full model name
|
||||
with version.
|
||||
"""
|
||||
|
||||
cli_download(model, direct)
|
||||
|
||||
|
||||
|
@ -44,7 +44,6 @@ class CLI(object):
|
|||
either the name of a pip package, or the local path to the model data
|
||||
directory. Linking models allows loading them via spacy.load(link_name).
|
||||
"""
|
||||
|
||||
cli_link(origin, link_name, force)
|
||||
|
||||
|
||||
|
@ -58,7 +57,6 @@ class CLI(object):
|
|||
specified as an argument, print model information. Flag --markdown
|
||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||
"""
|
||||
|
||||
cli_info(model, markdown)
|
||||
|
||||
|
||||
|
@ -73,7 +71,6 @@ class CLI(object):
|
|||
installation files. A new directory will be created in the specified
|
||||
output directory, and model data will be copied over.
|
||||
"""
|
||||
|
||||
cli_package(input_dir, output_dir, force)
|
||||
|
||||
|
||||
|
@ -93,7 +90,6 @@ class CLI(object):
|
|||
"""
|
||||
Train a model. Expects data in spaCy's JSON format.
|
||||
"""
|
||||
|
||||
cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
|
||||
not no_parser, not no_ner, parser_L1)
|
||||
|
||||
|
@ -108,7 +104,6 @@ class CLI(object):
|
|||
"""
|
||||
Initialize a new model and its data directory.
|
||||
"""
|
||||
|
||||
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
|
||||
|
||||
@plac.annotations(
|
||||
|
@ -122,7 +117,6 @@ class CLI(object):
|
|||
Convert files into JSON format for use with train command and other
|
||||
experiment management functions.
|
||||
"""
|
||||
|
||||
cli_convert(input_file, output_dir, n_sents, morphology)
|
||||
|
||||
|
||||
|
|
|
@ -92,7 +92,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
|||
|
||||
|
||||
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||
'''Normalize a dictionary of attributes, converting them to ints.
|
||||
"""
|
||||
Normalize a dictionary of attributes, converting them to ints.
|
||||
|
||||
Arguments:
|
||||
stringy_attrs (dict):
|
||||
|
@ -105,7 +106,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
|||
inty_attrs (dict):
|
||||
Attributes dictionary with keys and optionally values converted to
|
||||
ints.
|
||||
'''
|
||||
"""
|
||||
inty_attrs = {}
|
||||
if _do_deprecated:
|
||||
if 'F' in stringy_attrs:
|
||||
|
|
|
@ -7,7 +7,8 @@ from ... import util
|
|||
|
||||
|
||||
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||
"""Convert conllu files into JSON format for use with train cli.
|
||||
"""
|
||||
Convert conllu files into JSON format for use with train cli.
|
||||
use_morphology parameter enables appending morphology to tags, which is
|
||||
useful for languages such as Spanish, where UD tags are not so rich.
|
||||
"""
|
||||
|
|
|
@ -36,7 +36,8 @@ def align_tokens(ref, indices): # Deprecated, surely?
|
|||
|
||||
|
||||
def detokenize(token_rules, words): # Deprecated?
|
||||
"""To align with treebanks, return a list of "chunks", where a chunk is a
|
||||
"""
|
||||
To align with treebanks, return a list of "chunks", where a chunk is a
|
||||
sequence of tokens that are separated by whitespace in actual strings. Each
|
||||
chunk should be a tuple of token indices, e.g.
|
||||
|
||||
|
@ -57,10 +58,13 @@ def detokenize(token_rules, words): # Deprecated?
|
|||
return positions
|
||||
|
||||
|
||||
def fix_glove_vectors_loading(overrides):
|
||||
"""Special-case hack for loading the GloVe vectors, to support deprecated
|
||||
<1.0 stuff. Phase this out once the data is fixed."""
|
||||
|
||||
|
||||
def fix_glove_vectors_loading(overrides):
|
||||
"""
|
||||
Special-case hack for loading the GloVe vectors, to support deprecated
|
||||
<1.0 stuff. Phase this out once the data is fixed.
|
||||
"""
|
||||
if 'data_dir' in overrides and 'path' not in overrides:
|
||||
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
|
||||
if overrides.get('path') is False:
|
||||
|
@ -88,13 +92,13 @@ def fix_glove_vectors_loading(overrides):
|
|||
|
||||
|
||||
def resolve_model_name(name):
|
||||
"""If spaCy is loaded with 'de', check if symlink already exists. If
|
||||
"""
|
||||
If spaCy is loaded with 'de', check if symlink already exists. If
|
||||
not, the user has upgraded from an older version and has old models installed.
|
||||
Check if old model directory exists and if so, return that instead and create
|
||||
shortcut link. If English model is found and no shortcut exists, raise error
|
||||
and tell user to install new model.
|
||||
"""
|
||||
|
||||
if name == 'en' or name == 'de':
|
||||
versions = ['1.0.0', '1.1.0']
|
||||
data_path = Path(util.get_data_path())
|
||||
|
@ -117,9 +121,11 @@ def resolve_model_name(name):
|
|||
|
||||
|
||||
class ModelDownload():
|
||||
"""Replace download modules within en and de with deprecation warning and
|
||||
"""
|
||||
Replace download modules within en and de with deprecation warning and
|
||||
download default language model (using shortcut). Use classmethods to allow
|
||||
importing ModelDownload as download and calling download.en() etc."""
|
||||
importing ModelDownload as download and calling download.en() etc.
|
||||
"""
|
||||
|
||||
@classmethod
|
||||
def load(self, lang):
|
||||
|
|
|
@ -220,7 +220,8 @@ cdef class GoldParse:
|
|||
|
||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||
deps=None, entities=None, make_projective=False):
|
||||
"""Create a GoldParse.
|
||||
"""
|
||||
Create a GoldParse.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
|
@ -310,13 +311,16 @@ cdef class GoldParse:
|
|||
|
||||
@property
|
||||
def is_projective(self):
|
||||
"""Whether the provided syntactic annotations form a projective dependency
|
||||
tree."""
|
||||
"""
|
||||
Whether the provided syntactic annotations form a projective dependency
|
||||
tree.
|
||||
"""
|
||||
return not nonproj.is_nonproj_tree(self.heads)
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities):
|
||||
'''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
"""
|
||||
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||
scheme (biluo).
|
||||
|
||||
Arguments:
|
||||
|
@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
|
|||
tags = biluo_tags_from_offsets(doc, entities)
|
||||
|
||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||
'''
|
||||
"""
|
||||
starts = {token.idx: token.i for token in doc}
|
||||
ends = {token.idx+len(token): token.i for token in doc}
|
||||
biluo = ['-' for _ in doc]
|
||||
|
|
|
@ -202,9 +202,10 @@ class BaseDefaults(object):
|
|||
|
||||
|
||||
class Language(object):
|
||||
'''A text-processing pipeline. Usually you'll load this once per process, and
|
||||
"""
|
||||
A text-processing pipeline. Usually you'll load this once per process, and
|
||||
pass the instance around your program.
|
||||
'''
|
||||
"""
|
||||
Defaults = BaseDefaults
|
||||
lang = None
|
||||
|
||||
|
@ -342,7 +343,8 @@ class Language(object):
|
|||
return doc
|
||||
|
||||
def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
|
||||
'''Process texts as a stream, and yield Doc objects in order.
|
||||
"""
|
||||
Process texts as a stream, and yield Doc objects in order.
|
||||
|
||||
Supports GIL-free multi-threading.
|
||||
|
||||
|
@ -351,7 +353,7 @@ class Language(object):
|
|||
tag (bool)
|
||||
parse (bool)
|
||||
entity (bool)
|
||||
'''
|
||||
"""
|
||||
skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
|
||||
stream = (self.make_doc(text) for text in texts)
|
||||
for proc in self.pipeline:
|
||||
|
|
|
@ -38,8 +38,10 @@ class Lemmatizer(object):
|
|||
return lemmas
|
||||
|
||||
def is_base_form(self, univ_pos, morphology=None):
|
||||
'''Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.'''
|
||||
"""
|
||||
Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.
|
||||
"""
|
||||
morphology = {} if morphology is None else morphology
|
||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
||||
true_morph_key = morphology.get('morph', 0)
|
||||
|
|
|
@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
|||
|
||||
|
||||
cdef class Lexeme:
|
||||
"""An entry in the vocabulary. A Lexeme has no string context --- it's a
|
||||
"""
|
||||
An entry in the vocabulary. A Lexeme has no string context --- it's a
|
||||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||
tag).
|
||||
"""
|
||||
def __init__(self, Vocab vocab, int orth):
|
||||
"""Create a Lexeme object.
|
||||
"""
|
||||
Create a Lexeme object.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab): The parent vocabulary
|
||||
|
@ -80,7 +82,8 @@ cdef class Lexeme:
|
|||
return self.c.orth
|
||||
|
||||
def set_flag(self, attr_id_t flag_id, bint value):
|
||||
"""Change the value of a boolean flag.
|
||||
"""
|
||||
Change the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to set.
|
||||
|
@ -89,7 +92,8 @@ cdef class Lexeme:
|
|||
Lexeme.c_set_flag(self.c, flag_id, value)
|
||||
|
||||
def check_flag(self, attr_id_t flag_id):
|
||||
"""Check the value of a boolean flag.
|
||||
"""
|
||||
Check the value of a boolean flag.
|
||||
|
||||
Arguments:
|
||||
flag_id (int): The attribute ID of the flag to query.
|
||||
|
@ -98,7 +102,8 @@ cdef class Lexeme:
|
|||
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
||||
|
||||
def similarity(self, other):
|
||||
'''Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
"""
|
||||
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||
|
||||
Arguments:
|
||||
other:
|
||||
|
@ -106,7 +111,7 @@ cdef class Lexeme:
|
|||
Token and Lexeme objects.
|
||||
Returns:
|
||||
score (float): A scalar similarity score. Higher is more similar.
|
||||
'''
|
||||
"""
|
||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||
return 0.0
|
||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||
|
|
|
@ -180,7 +180,8 @@ cdef class Matcher:
|
|||
|
||||
@classmethod
|
||||
def load(cls, path, vocab):
|
||||
'''Load the matcher and patterns from a file path.
|
||||
"""
|
||||
Load the matcher and patterns from a file path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
|
@ -189,7 +190,7 @@ cdef class Matcher:
|
|||
The vocabulary that the documents to match over will refer to.
|
||||
Returns:
|
||||
Matcher: The newly constructed object.
|
||||
'''
|
||||
"""
|
||||
if (path / 'gazetteer.json').exists():
|
||||
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
|
||||
patterns = json.load(file_)
|
||||
|
@ -198,7 +199,8 @@ cdef class Matcher:
|
|||
return cls(vocab, patterns)
|
||||
|
||||
def __init__(self, vocab, patterns={}):
|
||||
"""Create the Matcher.
|
||||
"""
|
||||
Create the Matcher.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
|
@ -227,7 +229,8 @@ cdef class Matcher:
|
|||
|
||||
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
||||
acceptor=None, on_match=None):
|
||||
"""Add an entity to the matcher.
|
||||
"""
|
||||
Add an entity to the matcher.
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
|
@ -264,7 +267,8 @@ cdef class Matcher:
|
|||
self._callbacks[entity_key] = on_match
|
||||
|
||||
def add_pattern(self, entity_key, token_specs, label=""):
|
||||
"""Add a pattern to the matcher.
|
||||
"""
|
||||
Add a pattern to the matcher.
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int):
|
||||
|
@ -307,7 +311,8 @@ cdef class Matcher:
|
|||
return entity_key
|
||||
|
||||
def has_entity(self, entity_key):
|
||||
"""Check whether the matcher has an entity.
|
||||
"""
|
||||
Check whether the matcher has an entity.
|
||||
|
||||
Arguments:
|
||||
entity_key (string or int): The entity key to check.
|
||||
|
@ -318,7 +323,8 @@ cdef class Matcher:
|
|||
return entity_key in self._entities
|
||||
|
||||
def get_entity(self, entity_key):
|
||||
"""Retrieve the attributes stored for an entity.
|
||||
"""
|
||||
Retrieve the attributes stored for an entity.
|
||||
|
||||
Arguments:
|
||||
entity_key (unicode or int): The entity to retrieve.
|
||||
|
@ -332,7 +338,8 @@ cdef class Matcher:
|
|||
return None
|
||||
|
||||
def __call__(self, Doc doc, acceptor=None):
|
||||
"""Find all token sequences matching the supplied patterns on the Doc.
|
||||
"""
|
||||
Find all token sequences matching the supplied patterns on the Doc.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
|
@ -445,7 +452,8 @@ cdef class Matcher:
|
|||
return matches
|
||||
|
||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||
"""Match a stream of documents, yielding them in turn.
|
||||
"""
|
||||
Match a stream of documents, yielding them in turn.
|
||||
|
||||
Arguments:
|
||||
docs: A stream of documents.
|
||||
|
|
|
@ -16,7 +16,9 @@ from .attrs import LEMMA, intify_attrs
|
|||
|
||||
|
||||
def _normalize_props(props):
|
||||
'''Transform deprecated string keys to correct names.'''
|
||||
"""
|
||||
Transform deprecated string keys to correct names.
|
||||
"""
|
||||
out = {}
|
||||
for key, value in props.items():
|
||||
if key == POS:
|
||||
|
@ -98,13 +100,14 @@ cdef class Morphology:
|
|||
flags[0] &= ~(one << flag_id)
|
||||
|
||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
||||
'''Add a special-case rule to the morphological analyser. Tokens whose
|
||||
"""
|
||||
Add a special-case rule to the morphological analyser. Tokens whose
|
||||
tag and orth match the rule will receive the specified properties.
|
||||
|
||||
Arguments:
|
||||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
'''
|
||||
"""
|
||||
tag = self.strings[tag_str]
|
||||
tag_id = self.reverse_index[tag]
|
||||
orth = self.strings[orth_str]
|
||||
|
|
|
@ -11,7 +11,9 @@ from .attrs import DEP, ENT_TYPE
|
|||
|
||||
|
||||
cdef class EntityRecognizer(Parser):
|
||||
"""Annotate named entities on Doc objects."""
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
@ -28,7 +30,9 @@ cdef class EntityRecognizer(Parser):
|
|||
|
||||
|
||||
cdef class BeamEntityRecognizer(BeamParser):
|
||||
"""Annotate named entities on Doc objects."""
|
||||
"""
|
||||
Annotate named entities on Doc objects.
|
||||
"""
|
||||
TransitionSystem = BiluoPushDown
|
||||
|
||||
feature_templates = get_feature_templates('ner')
|
||||
|
|
|
@ -6,7 +6,9 @@ from .gold import tags_to_entities
|
|||
|
||||
|
||||
class PRFScore(object):
|
||||
"""A precision / recall / F score"""
|
||||
"""
|
||||
A precision / recall / F score
|
||||
"""
|
||||
def __init__(self):
|
||||
self.tp = 0
|
||||
self.fp = 0
|
||||
|
|
|
@ -73,13 +73,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
|||
|
||||
|
||||
cdef class StringStore:
|
||||
'''Map strings to and from integer IDs.'''
|
||||
"""
|
||||
Map strings to and from integer IDs.
|
||||
"""
|
||||
def __init__(self, strings=None, freeze=False):
|
||||
'''Create the StringStore.
|
||||
"""
|
||||
Create the StringStore.
|
||||
|
||||
Arguments:
|
||||
strings: A sequence of unicode strings to add to the store.
|
||||
'''
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._map = PreshMap()
|
||||
self._oov = PreshMap()
|
||||
|
@ -104,7 +107,8 @@ cdef class StringStore:
|
|||
return (StringStore, (list(self),))
|
||||
|
||||
def __len__(self):
|
||||
"""The number of strings in the store.
|
||||
"""
|
||||
The number of strings in the store.
|
||||
|
||||
Returns:
|
||||
int The number of strings in the store.
|
||||
|
@ -112,8 +116,9 @@ cdef class StringStore:
|
|||
return self.size-1
|
||||
|
||||
def __getitem__(self, object string_or_id):
|
||||
"""Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
"""
|
||||
Retrieve a string from a given integer ID, or vice versa.
|
||||
|
||||
Arguments:
|
||||
string_or_id (bytes or unicode or int):
|
||||
The value to encode.
|
||||
|
@ -159,7 +164,8 @@ cdef class StringStore:
|
|||
return utf8str - self.c
|
||||
|
||||
def __contains__(self, unicode string not None):
|
||||
"""Check whether a string is in the store.
|
||||
"""
|
||||
Check whether a string is in the store.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to check.
|
||||
|
@ -172,7 +178,8 @@ cdef class StringStore:
|
|||
return self._map.get(key) is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
"""Iterate over the strings in the store, in order.
|
||||
"""
|
||||
Iterate over the strings in the store, in order.
|
||||
|
||||
Yields: unicode A string in the store.
|
||||
"""
|
||||
|
@ -230,7 +237,8 @@ cdef class StringStore:
|
|||
return &self.c[self.size-1]
|
||||
|
||||
def dump(self, file_):
|
||||
"""Save the strings to a JSON file.
|
||||
"""
|
||||
Save the strings to a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to save the strings.
|
||||
|
@ -244,7 +252,8 @@ cdef class StringStore:
|
|||
file_.write(string_data)
|
||||
|
||||
def load(self, file_):
|
||||
"""Load the strings from a JSON file.
|
||||
"""
|
||||
Load the strings from a JSON file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file from which to load the strings.
|
||||
|
|
|
@ -106,10 +106,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
|||
|
||||
|
||||
cdef class Tagger:
|
||||
"""Annotate part-of-speech tags on Doc objects."""
|
||||
"""
|
||||
Annotate part-of-speech tags on Doc objects.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, vocab, require=False):
|
||||
"""Load the statistical model from the supplied path.
|
||||
"""
|
||||
Load the statistical model from the supplied path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
|
@ -142,7 +145,8 @@ cdef class Tagger:
|
|||
return self
|
||||
|
||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||
"""Create a Tagger.
|
||||
"""
|
||||
Create a Tagger.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
|
@ -180,7 +184,8 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def __call__(self, Doc tokens):
|
||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
||||
"""
|
||||
Apply the tagger, setting the POS tags onto the Doc object.
|
||||
|
||||
Arguments:
|
||||
tokens (Doc): The tokens to be tagged.
|
||||
|
@ -208,7 +213,8 @@ cdef class Tagger:
|
|||
tokens._py_tokens = [None] * tokens.length
|
||||
|
||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||
"""Tag a stream of documents.
|
||||
"""
|
||||
Tag a stream of documents.
|
||||
|
||||
Arguments:
|
||||
stream: The sequence of documents to tag.
|
||||
|
@ -225,7 +231,8 @@ cdef class Tagger:
|
|||
yield doc
|
||||
|
||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||
"""Update the statistical model, with tags supplied for the given document.
|
||||
"""
|
||||
Update the statistical model, with tags supplied for the given document.
|
||||
|
||||
Arguments:
|
||||
doc (Doc):
|
||||
|
|
|
@ -23,12 +23,15 @@ from .tokens.doc cimport Doc
|
|||
|
||||
|
||||
cdef class Tokenizer:
|
||||
"""Segment text, and create Doc objects with the discovered segment boundaries."""
|
||||
"""
|
||||
Segment text, and create Doc objects with the discovered segment boundaries.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
|
||||
infix_finditer=None, token_match=None):
|
||||
'''Load a Tokenizer, reading unsupplied components from the path.
|
||||
|
||||
"""
|
||||
Load a Tokenizer, reading unsupplied components from the path.
|
||||
|
||||
Arguments:
|
||||
path (Path):
|
||||
The path to load from.
|
||||
|
@ -45,10 +48,10 @@ cdef class Tokenizer:
|
|||
infix_finditer:
|
||||
Signature of re.compile(string).finditer
|
||||
Returns Tokenizer
|
||||
'''
|
||||
if isinstance(path, basestring):
|
||||
path = pathlib.Path(path)
|
||||
|
||||
"""
|
||||
if rules is None:
|
||||
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
||||
rules = json.load(file_)
|
||||
|
@ -67,8 +70,9 @@ cdef class Tokenizer:
|
|||
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
|
||||
|
||||
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
||||
'''Create a Tokenizer, to create Doc objects given unicode text.
|
||||
|
||||
"""
|
||||
Create a Tokenizer, to create Doc objects given unicode text.
|
||||
|
||||
Arguments:
|
||||
vocab (Vocab):
|
||||
A storage container for lexical types.
|
||||
|
@ -85,7 +89,7 @@ cdef class Tokenizer:
|
|||
to find infixes.
|
||||
token_match:
|
||||
A boolean function matching strings that become tokens.
|
||||
'''
|
||||
"""
|
||||
self.mem = Pool()
|
||||
self._cache = PreshMap()
|
||||
self._specials = PreshMap()
|
||||
|
@ -117,7 +121,8 @@ cdef class Tokenizer:
|
|||
|
||||
@cython.boundscheck(False)
|
||||
def __call__(self, unicode string):
|
||||
"""Tokenize a string.
|
||||
"""
|
||||
Tokenize a string.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to tokenize.
|
||||
|
@ -170,7 +175,8 @@ cdef class Tokenizer:
|
|||
return tokens
|
||||
|
||||
def pipe(self, texts, batch_size=1000, n_threads=2):
|
||||
"""Tokenize a stream of texts.
|
||||
"""
|
||||
Tokenize a stream of texts.
|
||||
|
||||
Arguments:
|
||||
texts: A sequence of unicode texts.
|
||||
|
@ -324,7 +330,8 @@ cdef class Tokenizer:
|
|||
self._cache.set(key, cached)
|
||||
|
||||
def find_infix(self, unicode string):
|
||||
"""Find internal split points of the string, such as hyphens.
|
||||
"""
|
||||
Find internal split points of the string, such as hyphens.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
|
||||
|
@ -337,7 +344,8 @@ cdef class Tokenizer:
|
|||
return list(self.infix_finditer(string))
|
||||
|
||||
def find_prefix(self, unicode string):
|
||||
"""Find the length of a prefix that should be segmented from the string,
|
||||
"""
|
||||
Find the length of a prefix that should be segmented from the string,
|
||||
or 0 if no prefix rules match.
|
||||
|
||||
Arguments:
|
||||
|
@ -350,7 +358,8 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def find_suffix(self, unicode string):
|
||||
"""Find the length of a suffix that should be segmented from the string,
|
||||
"""
|
||||
Find the length of a suffix that should be segmented from the string,
|
||||
or 0 if no suffix rules match.
|
||||
|
||||
Arguments:
|
||||
|
@ -363,13 +372,15 @@ cdef class Tokenizer:
|
|||
return (match.end() - match.start()) if match is not None else 0
|
||||
|
||||
def _load_special_tokenization(self, special_cases):
|
||||
'''Add special-case tokenization rules.
|
||||
'''
|
||||
"""
|
||||
Add special-case tokenization rules.
|
||||
"""
|
||||
for chunk, substrings in sorted(special_cases.items()):
|
||||
self.add_special_case(chunk, substrings)
|
||||
|
||||
def add_special_case(self, unicode string, substrings):
|
||||
'''Add a special-case tokenization rule.
|
||||
"""
|
||||
Add a special-case tokenization rule.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The string to specially tokenize.
|
||||
|
@ -378,7 +389,7 @@ cdef class Tokenizer:
|
|||
attributes. The ORTH fields of the attributes must exactly match
|
||||
the string when they are concatenated.
|
||||
Returns None
|
||||
'''
|
||||
"""
|
||||
substrings = list(substrings)
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.length = len(substrings)
|
||||
|
|
|
@ -9,7 +9,9 @@ from .gold import merge_sents
|
|||
|
||||
|
||||
class Trainer(object):
|
||||
'''Manage training of an NLP pipeline.'''
|
||||
"""
|
||||
Manage training of an NLP pipeline.
|
||||
"""
|
||||
def __init__(self, nlp, gold_tuples):
|
||||
self.nlp = nlp
|
||||
self.gold_tuples = gold_tuples
|
||||
|
|
|
@ -48,8 +48,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC
|
|||
|
||||
|
||||
cdef class Vocab:
|
||||
'''A map container for a language's LexemeC structs.
|
||||
'''
|
||||
"""
|
||||
A map container for a language's LexemeC structs.
|
||||
"""
|
||||
@classmethod
|
||||
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
|
||||
tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
|
||||
|
@ -108,7 +109,8 @@ cdef class Vocab:
|
|||
|
||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||
serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
|
||||
'''Create the vocabulary.
|
||||
"""
|
||||
Create the vocabulary.
|
||||
|
||||
lex_attr_getters (dict):
|
||||
A dictionary mapping attribute IDs to functions to compute them.
|
||||
|
@ -123,7 +125,7 @@ cdef class Vocab:
|
|||
|
||||
Returns:
|
||||
Vocab: The newly constructed vocab object.
|
||||
'''
|
||||
"""
|
||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||
|
||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||
|
@ -172,17 +174,19 @@ cdef class Vocab:
|
|||
return langfunc('_') if langfunc else ''
|
||||
|
||||
def __len__(self):
|
||||
"""The current number of lexemes stored."""
|
||||
"""
|
||||
The current number of lexemes stored.
|
||||
"""
|
||||
return self.length
|
||||
|
||||
def resize_vectors(self, int new_size):
|
||||
'''
|
||||
"""
|
||||
Set vectors_length to a new size, and allocate more memory for the Lexeme
|
||||
vectors if necessary. The memory will be zeroed.
|
||||
|
||||
Arguments:
|
||||
new_size (int): The new size of the vectors.
|
||||
'''
|
||||
"""
|
||||
cdef hash_t key
|
||||
cdef size_t addr
|
||||
if new_size > self.vectors_length:
|
||||
|
@ -193,7 +197,8 @@ cdef class Vocab:
|
|||
self.vectors_length = new_size
|
||||
|
||||
def add_flag(self, flag_getter, int flag_id=-1):
|
||||
'''Set a new boolean flag to words in the vocabulary.
|
||||
"""
|
||||
Set a new boolean flag to words in the vocabulary.
|
||||
|
||||
The flag_getter function will be called over the words currently in the
|
||||
vocab, and then applied to new words as they occur. You'll then be able
|
||||
|
@ -213,7 +218,7 @@ cdef class Vocab:
|
|||
|
||||
Returns:
|
||||
flag_id (int): The integer ID by which the flag value can be checked.
|
||||
'''
|
||||
"""
|
||||
if flag_id == -1:
|
||||
for bit in range(1, 64):
|
||||
if bit not in self.lex_attr_getters:
|
||||
|
@ -234,9 +239,11 @@ cdef class Vocab:
|
|||
return flag_id
|
||||
|
||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
"""
|
||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if string == u'':
|
||||
return &EMPTY_LEXEME
|
||||
cdef LexemeC* lex
|
||||
|
@ -252,9 +259,11 @@ cdef class Vocab:
|
|||
return self._new_lexeme(mem, string)
|
||||
|
||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
"""
|
||||
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||
if necessary, using memory acquired from the given pool. If the pool
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
||||
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||
"""
|
||||
if orth == 0:
|
||||
return &EMPTY_LEXEME
|
||||
cdef LexemeC* lex
|
||||
|
@ -297,30 +306,33 @@ cdef class Vocab:
|
|||
self.length += 1
|
||||
|
||||
def __contains__(self, unicode string):
|
||||
'''Check whether the string has an entry in the vocabulary.
|
||||
"""
|
||||
Check whether the string has an entry in the vocabulary.
|
||||
|
||||
Arguments:
|
||||
string (unicode): The ID string.
|
||||
|
||||
Returns:
|
||||
bool Whether the string has an entry in the vocabulary.
|
||||
'''
|
||||
"""
|
||||
key = hash_string(string)
|
||||
lex = self._by_hash.get(key)
|
||||
return lex is not NULL
|
||||
|
||||
def __iter__(self):
|
||||
'''Iterate over the lexemes in the vocabulary.
|
||||
"""
|
||||
Iterate over the lexemes in the vocabulary.
|
||||
|
||||
Yields: Lexeme An entry in the vocabulary.
|
||||
'''
|
||||
"""
|
||||
cdef attr_t orth
|
||||
cdef size_t addr
|
||||
for orth, addr in self._by_orth.items():
|
||||
yield Lexeme(self, orth)
|
||||
|
||||
def __getitem__(self, id_or_string):
|
||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
"""
|
||||
Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||
unseen unicode string is given, a new lexeme is created and stored.
|
||||
|
||||
Arguments:
|
||||
|
@ -332,7 +344,7 @@ cdef class Vocab:
|
|||
|
||||
Returns:
|
||||
lexeme (Lexeme): The lexeme indicated by the given ID.
|
||||
'''
|
||||
"""
|
||||
cdef attr_t orth
|
||||
if type(id_or_string) == unicode:
|
||||
orth = self.strings[id_or_string]
|
||||
|
@ -355,7 +367,8 @@ cdef class Vocab:
|
|||
return tokens
|
||||
|
||||
def dump(self, loc=None):
|
||||
"""Save the lexemes binary data to the given location, or
|
||||
"""
|
||||
Save the lexemes binary data to the given location, or
|
||||
return a byte-string with the data if loc is None.
|
||||
|
||||
Arguments:
|
||||
|
@ -392,14 +405,15 @@ cdef class Vocab:
|
|||
return fp.string_data()
|
||||
|
||||
def load_lexemes(self, loc):
|
||||
'''Load the binary vocabulary data from the given location.
|
||||
"""
|
||||
Load the binary vocabulary data from the given location.
|
||||
|
||||
Arguments:
|
||||
loc (Path): The path to load from.
|
||||
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
"""
|
||||
fp = CFile(loc, 'rb',
|
||||
on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
|
||||
cdef LexemeC* lexeme = NULL
|
||||
|
@ -440,8 +454,9 @@ cdef class Vocab:
|
|||
fp.close()
|
||||
|
||||
def _deserialize_lexemes(self, CFile fp):
|
||||
'''Load the binary vocabulary data from the given CFile.
|
||||
'''
|
||||
"""
|
||||
Load the binary vocabulary data from the given CFile.
|
||||
"""
|
||||
cdef LexemeC* lexeme = NULL
|
||||
cdef hash_t key
|
||||
cdef unicode py_str
|
||||
|
@ -494,13 +509,14 @@ cdef class Vocab:
|
|||
fp.close()
|
||||
|
||||
def dump_vectors(self, out_loc):
|
||||
'''Save the word vectors to a binary file.
|
||||
"""
|
||||
Save the word vectors to a binary file.
|
||||
|
||||
Arguments:
|
||||
out_loc (Path): The path to save to.
|
||||
Returns:
|
||||
None
|
||||
'''
|
||||
"""
|
||||
cdef int32_t vec_len = self.vectors_length
|
||||
cdef int32_t word_len
|
||||
cdef bytes word_str
|
||||
|
@ -522,7 +538,8 @@ cdef class Vocab:
|
|||
out_file.close()
|
||||
|
||||
def load_vectors(self, file_):
|
||||
"""Load vectors from a text-based file.
|
||||
"""
|
||||
Load vectors from a text-based file.
|
||||
|
||||
Arguments:
|
||||
file_ (buffer): The file to read from. Entries should be separated by newlines,
|
||||
|
@ -561,7 +578,8 @@ cdef class Vocab:
|
|||
return vec_len
|
||||
|
||||
def load_vectors_from_bin_loc(self, loc):
|
||||
"""Load vectors from the location of a binary file.
|
||||
"""
|
||||
Load vectors from the location of a binary file.
|
||||
|
||||
Arguments:
|
||||
loc (unicode): The path of the binary file to load from.
|
||||
|
|
Loading…
Reference in New Issue
Block a user