mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Use consistent formatting for docstrings
This commit is contained in:
parent
d13f0a7017
commit
561f2a3eb4
|
@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert
|
||||||
|
|
||||||
|
|
||||||
class CLI(object):
|
class CLI(object):
|
||||||
"""Command-line interface for spaCy"""
|
"""
|
||||||
|
Command-line interface for spaCy
|
||||||
|
"""
|
||||||
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
|
commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert')
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
|
@ -29,7 +30,6 @@ class CLI(object):
|
||||||
can be shortcut, model name or, if --direct flag is set, full model name
|
can be shortcut, model name or, if --direct flag is set, full model name
|
||||||
with version.
|
with version.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cli_download(model, direct)
|
cli_download(model, direct)
|
||||||
|
|
||||||
|
|
||||||
|
@ -44,7 +44,6 @@ class CLI(object):
|
||||||
either the name of a pip package, or the local path to the model data
|
either the name of a pip package, or the local path to the model data
|
||||||
directory. Linking models allows loading them via spacy.load(link_name).
|
directory. Linking models allows loading them via spacy.load(link_name).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cli_link(origin, link_name, force)
|
cli_link(origin, link_name, force)
|
||||||
|
|
||||||
|
|
||||||
|
@ -58,7 +57,6 @@ class CLI(object):
|
||||||
specified as an argument, print model information. Flag --markdown
|
specified as an argument, print model information. Flag --markdown
|
||||||
prints details in Markdown for easy copy-pasting to GitHub issues.
|
prints details in Markdown for easy copy-pasting to GitHub issues.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cli_info(model, markdown)
|
cli_info(model, markdown)
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,7 +71,6 @@ class CLI(object):
|
||||||
installation files. A new directory will be created in the specified
|
installation files. A new directory will be created in the specified
|
||||||
output directory, and model data will be copied over.
|
output directory, and model data will be copied over.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cli_package(input_dir, output_dir, force)
|
cli_package(input_dir, output_dir, force)
|
||||||
|
|
||||||
|
|
||||||
|
@ -93,7 +90,6 @@ class CLI(object):
|
||||||
"""
|
"""
|
||||||
Train a model. Expects data in spaCy's JSON format.
|
Train a model. Expects data in spaCy's JSON format.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
|
cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger,
|
||||||
not no_parser, not no_ner, parser_L1)
|
not no_parser, not no_ner, parser_L1)
|
||||||
|
|
||||||
|
@ -108,7 +104,6 @@ class CLI(object):
|
||||||
"""
|
"""
|
||||||
Initialize a new model and its data directory.
|
Initialize a new model and its data directory.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
|
cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data)
|
||||||
|
|
||||||
@plac.annotations(
|
@plac.annotations(
|
||||||
|
@ -122,7 +117,6 @@ class CLI(object):
|
||||||
Convert files into JSON format for use with train command and other
|
Convert files into JSON format for use with train command and other
|
||||||
experiment management functions.
|
experiment management functions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
cli_convert(input_file, output_dir, n_sents, morphology)
|
cli_convert(input_file, output_dir, n_sents, morphology)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -92,7 +92,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
|
||||||
|
|
||||||
|
|
||||||
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
'''Normalize a dictionary of attributes, converting them to ints.
|
"""
|
||||||
|
Normalize a dictionary of attributes, converting them to ints.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
stringy_attrs (dict):
|
stringy_attrs (dict):
|
||||||
|
@ -105,7 +106,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
|
||||||
inty_attrs (dict):
|
inty_attrs (dict):
|
||||||
Attributes dictionary with keys and optionally values converted to
|
Attributes dictionary with keys and optionally values converted to
|
||||||
ints.
|
ints.
|
||||||
'''
|
"""
|
||||||
inty_attrs = {}
|
inty_attrs = {}
|
||||||
if _do_deprecated:
|
if _do_deprecated:
|
||||||
if 'F' in stringy_attrs:
|
if 'F' in stringy_attrs:
|
||||||
|
|
|
@ -7,7 +7,8 @@ from ... import util
|
||||||
|
|
||||||
|
|
||||||
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
||||||
"""Convert conllu files into JSON format for use with train cli.
|
"""
|
||||||
|
Convert conllu files into JSON format for use with train cli.
|
||||||
use_morphology parameter enables appending morphology to tags, which is
|
use_morphology parameter enables appending morphology to tags, which is
|
||||||
useful for languages such as Spanish, where UD tags are not so rich.
|
useful for languages such as Spanish, where UD tags are not so rich.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -36,7 +36,8 @@ def align_tokens(ref, indices): # Deprecated, surely?
|
||||||
|
|
||||||
|
|
||||||
def detokenize(token_rules, words): # Deprecated?
|
def detokenize(token_rules, words): # Deprecated?
|
||||||
"""To align with treebanks, return a list of "chunks", where a chunk is a
|
"""
|
||||||
|
To align with treebanks, return a list of "chunks", where a chunk is a
|
||||||
sequence of tokens that are separated by whitespace in actual strings. Each
|
sequence of tokens that are separated by whitespace in actual strings. Each
|
||||||
chunk should be a tuple of token indices, e.g.
|
chunk should be a tuple of token indices, e.g.
|
||||||
|
|
||||||
|
@ -57,10 +58,13 @@ def detokenize(token_rules, words): # Deprecated?
|
||||||
return positions
|
return positions
|
||||||
|
|
||||||
|
|
||||||
def fix_glove_vectors_loading(overrides):
|
|
||||||
"""Special-case hack for loading the GloVe vectors, to support deprecated
|
|
||||||
<1.0 stuff. Phase this out once the data is fixed."""
|
|
||||||
|
|
||||||
|
|
||||||
|
def fix_glove_vectors_loading(overrides):
|
||||||
|
"""
|
||||||
|
Special-case hack for loading the GloVe vectors, to support deprecated
|
||||||
|
<1.0 stuff. Phase this out once the data is fixed.
|
||||||
|
"""
|
||||||
if 'data_dir' in overrides and 'path' not in overrides:
|
if 'data_dir' in overrides and 'path' not in overrides:
|
||||||
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
|
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
|
||||||
if overrides.get('path') is False:
|
if overrides.get('path') is False:
|
||||||
|
@ -88,13 +92,13 @@ def fix_glove_vectors_loading(overrides):
|
||||||
|
|
||||||
|
|
||||||
def resolve_model_name(name):
|
def resolve_model_name(name):
|
||||||
"""If spaCy is loaded with 'de', check if symlink already exists. If
|
"""
|
||||||
|
If spaCy is loaded with 'de', check if symlink already exists. If
|
||||||
not, the user has upgraded from an older version and has old models installed.
|
not, the user has upgraded from an older version and has old models installed.
|
||||||
Check if old model directory exists and if so, return that instead and create
|
Check if old model directory exists and if so, return that instead and create
|
||||||
shortcut link. If English model is found and no shortcut exists, raise error
|
shortcut link. If English model is found and no shortcut exists, raise error
|
||||||
and tell user to install new model.
|
and tell user to install new model.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if name == 'en' or name == 'de':
|
if name == 'en' or name == 'de':
|
||||||
versions = ['1.0.0', '1.1.0']
|
versions = ['1.0.0', '1.1.0']
|
||||||
data_path = Path(util.get_data_path())
|
data_path = Path(util.get_data_path())
|
||||||
|
@ -117,9 +121,11 @@ def resolve_model_name(name):
|
||||||
|
|
||||||
|
|
||||||
class ModelDownload():
|
class ModelDownload():
|
||||||
"""Replace download modules within en and de with deprecation warning and
|
"""
|
||||||
|
Replace download modules within en and de with deprecation warning and
|
||||||
download default language model (using shortcut). Use classmethods to allow
|
download default language model (using shortcut). Use classmethods to allow
|
||||||
importing ModelDownload as download and calling download.en() etc."""
|
importing ModelDownload as download and calling download.en() etc.
|
||||||
|
"""
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(self, lang):
|
def load(self, lang):
|
||||||
|
|
|
@ -220,7 +220,8 @@ cdef class GoldParse:
|
||||||
|
|
||||||
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None,
|
||||||
deps=None, entities=None, make_projective=False):
|
deps=None, entities=None, make_projective=False):
|
||||||
"""Create a GoldParse.
|
"""
|
||||||
|
Create a GoldParse.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc):
|
doc (Doc):
|
||||||
|
@ -310,13 +311,16 @@ cdef class GoldParse:
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def is_projective(self):
|
def is_projective(self):
|
||||||
"""Whether the provided syntactic annotations form a projective dependency
|
"""
|
||||||
tree."""
|
Whether the provided syntactic annotations form a projective dependency
|
||||||
|
tree.
|
||||||
|
"""
|
||||||
return not nonproj.is_nonproj_tree(self.heads)
|
return not nonproj.is_nonproj_tree(self.heads)
|
||||||
|
|
||||||
|
|
||||||
def biluo_tags_from_offsets(doc, entities):
|
def biluo_tags_from_offsets(doc, entities):
|
||||||
'''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
"""
|
||||||
|
Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out
|
||||||
scheme (biluo).
|
scheme (biluo).
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities):
|
||||||
tags = biluo_tags_from_offsets(doc, entities)
|
tags = biluo_tags_from_offsets(doc, entities)
|
||||||
|
|
||||||
assert tags == ['O', 'O', 'U-LOC', 'O']
|
assert tags == ['O', 'O', 'U-LOC', 'O']
|
||||||
'''
|
"""
|
||||||
starts = {token.idx: token.i for token in doc}
|
starts = {token.idx: token.i for token in doc}
|
||||||
ends = {token.idx+len(token): token.i for token in doc}
|
ends = {token.idx+len(token): token.i for token in doc}
|
||||||
biluo = ['-' for _ in doc]
|
biluo = ['-' for _ in doc]
|
||||||
|
|
|
@ -202,9 +202,10 @@ class BaseDefaults(object):
|
||||||
|
|
||||||
|
|
||||||
class Language(object):
|
class Language(object):
|
||||||
'''A text-processing pipeline. Usually you'll load this once per process, and
|
"""
|
||||||
|
A text-processing pipeline. Usually you'll load this once per process, and
|
||||||
pass the instance around your program.
|
pass the instance around your program.
|
||||||
'''
|
"""
|
||||||
Defaults = BaseDefaults
|
Defaults = BaseDefaults
|
||||||
lang = None
|
lang = None
|
||||||
|
|
||||||
|
@ -342,7 +343,8 @@ class Language(object):
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
|
def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000):
|
||||||
'''Process texts as a stream, and yield Doc objects in order.
|
"""
|
||||||
|
Process texts as a stream, and yield Doc objects in order.
|
||||||
|
|
||||||
Supports GIL-free multi-threading.
|
Supports GIL-free multi-threading.
|
||||||
|
|
||||||
|
@ -351,7 +353,7 @@ class Language(object):
|
||||||
tag (bool)
|
tag (bool)
|
||||||
parse (bool)
|
parse (bool)
|
||||||
entity (bool)
|
entity (bool)
|
||||||
'''
|
"""
|
||||||
skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
|
skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity}
|
||||||
stream = (self.make_doc(text) for text in texts)
|
stream = (self.make_doc(text) for text in texts)
|
||||||
for proc in self.pipeline:
|
for proc in self.pipeline:
|
||||||
|
|
|
@ -38,8 +38,10 @@ class Lemmatizer(object):
|
||||||
return lemmas
|
return lemmas
|
||||||
|
|
||||||
def is_base_form(self, univ_pos, morphology=None):
|
def is_base_form(self, univ_pos, morphology=None):
|
||||||
'''Check whether we're dealing with an uninflected paradigm, so we can
|
"""
|
||||||
avoid lemmatization entirely.'''
|
Check whether we're dealing with an uninflected paradigm, so we can
|
||||||
|
avoid lemmatization entirely.
|
||||||
|
"""
|
||||||
morphology = {} if morphology is None else morphology
|
morphology = {} if morphology is None else morphology
|
||||||
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')]
|
||||||
true_morph_key = morphology.get('morph', 0)
|
true_morph_key = morphology.get('morph', 0)
|
||||||
|
|
|
@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||||
|
|
||||||
|
|
||||||
cdef class Lexeme:
|
cdef class Lexeme:
|
||||||
"""An entry in the vocabulary. A Lexeme has no string context --- it's a
|
"""
|
||||||
|
An entry in the vocabulary. A Lexeme has no string context --- it's a
|
||||||
word-type, as opposed to a word token. It therefore has no part-of-speech
|
word-type, as opposed to a word token. It therefore has no part-of-speech
|
||||||
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
tag, dependency parse, or lemma (lemmatization depends on the part-of-speech
|
||||||
tag).
|
tag).
|
||||||
"""
|
"""
|
||||||
def __init__(self, Vocab vocab, int orth):
|
def __init__(self, Vocab vocab, int orth):
|
||||||
"""Create a Lexeme object.
|
"""
|
||||||
|
Create a Lexeme object.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab (Vocab): The parent vocabulary
|
vocab (Vocab): The parent vocabulary
|
||||||
|
@ -80,7 +82,8 @@ cdef class Lexeme:
|
||||||
return self.c.orth
|
return self.c.orth
|
||||||
|
|
||||||
def set_flag(self, attr_id_t flag_id, bint value):
|
def set_flag(self, attr_id_t flag_id, bint value):
|
||||||
"""Change the value of a boolean flag.
|
"""
|
||||||
|
Change the value of a boolean flag.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
flag_id (int): The attribute ID of the flag to set.
|
flag_id (int): The attribute ID of the flag to set.
|
||||||
|
@ -89,7 +92,8 @@ cdef class Lexeme:
|
||||||
Lexeme.c_set_flag(self.c, flag_id, value)
|
Lexeme.c_set_flag(self.c, flag_id, value)
|
||||||
|
|
||||||
def check_flag(self, attr_id_t flag_id):
|
def check_flag(self, attr_id_t flag_id):
|
||||||
"""Check the value of a boolean flag.
|
"""
|
||||||
|
Check the value of a boolean flag.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
flag_id (int): The attribute ID of the flag to query.
|
flag_id (int): The attribute ID of the flag to query.
|
||||||
|
@ -98,7 +102,8 @@ cdef class Lexeme:
|
||||||
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
return True if Lexeme.c_check_flag(self.c, flag_id) else False
|
||||||
|
|
||||||
def similarity(self, other):
|
def similarity(self, other):
|
||||||
'''Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
"""
|
||||||
|
Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
other:
|
other:
|
||||||
|
@ -106,7 +111,7 @@ cdef class Lexeme:
|
||||||
Token and Lexeme objects.
|
Token and Lexeme objects.
|
||||||
Returns:
|
Returns:
|
||||||
score (float): A scalar similarity score. Higher is more similar.
|
score (float): A scalar similarity score. Higher is more similar.
|
||||||
'''
|
"""
|
||||||
if self.vector_norm == 0 or other.vector_norm == 0:
|
if self.vector_norm == 0 or other.vector_norm == 0:
|
||||||
return 0.0
|
return 0.0
|
||||||
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
|
||||||
|
|
|
@ -180,7 +180,8 @@ cdef class Matcher:
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, vocab):
|
def load(cls, path, vocab):
|
||||||
'''Load the matcher and patterns from a file path.
|
"""
|
||||||
|
Load the matcher and patterns from a file path.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
path (Path):
|
path (Path):
|
||||||
|
@ -189,7 +190,7 @@ cdef class Matcher:
|
||||||
The vocabulary that the documents to match over will refer to.
|
The vocabulary that the documents to match over will refer to.
|
||||||
Returns:
|
Returns:
|
||||||
Matcher: The newly constructed object.
|
Matcher: The newly constructed object.
|
||||||
'''
|
"""
|
||||||
if (path / 'gazetteer.json').exists():
|
if (path / 'gazetteer.json').exists():
|
||||||
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
|
with (path / 'gazetteer.json').open('r', encoding='utf8') as file_:
|
||||||
patterns = json.load(file_)
|
patterns = json.load(file_)
|
||||||
|
@ -198,7 +199,8 @@ cdef class Matcher:
|
||||||
return cls(vocab, patterns)
|
return cls(vocab, patterns)
|
||||||
|
|
||||||
def __init__(self, vocab, patterns={}):
|
def __init__(self, vocab, patterns={}):
|
||||||
"""Create the Matcher.
|
"""
|
||||||
|
Create the Matcher.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab (Vocab):
|
vocab (Vocab):
|
||||||
|
@ -227,7 +229,8 @@ cdef class Matcher:
|
||||||
|
|
||||||
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
def add_entity(self, entity_key, attrs=None, if_exists='raise',
|
||||||
acceptor=None, on_match=None):
|
acceptor=None, on_match=None):
|
||||||
"""Add an entity to the matcher.
|
"""
|
||||||
|
Add an entity to the matcher.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
entity_key (unicode or int):
|
entity_key (unicode or int):
|
||||||
|
@ -264,7 +267,8 @@ cdef class Matcher:
|
||||||
self._callbacks[entity_key] = on_match
|
self._callbacks[entity_key] = on_match
|
||||||
|
|
||||||
def add_pattern(self, entity_key, token_specs, label=""):
|
def add_pattern(self, entity_key, token_specs, label=""):
|
||||||
"""Add a pattern to the matcher.
|
"""
|
||||||
|
Add a pattern to the matcher.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
entity_key (unicode or int):
|
entity_key (unicode or int):
|
||||||
|
@ -307,7 +311,8 @@ cdef class Matcher:
|
||||||
return entity_key
|
return entity_key
|
||||||
|
|
||||||
def has_entity(self, entity_key):
|
def has_entity(self, entity_key):
|
||||||
"""Check whether the matcher has an entity.
|
"""
|
||||||
|
Check whether the matcher has an entity.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
entity_key (string or int): The entity key to check.
|
entity_key (string or int): The entity key to check.
|
||||||
|
@ -318,7 +323,8 @@ cdef class Matcher:
|
||||||
return entity_key in self._entities
|
return entity_key in self._entities
|
||||||
|
|
||||||
def get_entity(self, entity_key):
|
def get_entity(self, entity_key):
|
||||||
"""Retrieve the attributes stored for an entity.
|
"""
|
||||||
|
Retrieve the attributes stored for an entity.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
entity_key (unicode or int): The entity to retrieve.
|
entity_key (unicode or int): The entity to retrieve.
|
||||||
|
@ -332,7 +338,8 @@ cdef class Matcher:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def __call__(self, Doc doc, acceptor=None):
|
def __call__(self, Doc doc, acceptor=None):
|
||||||
"""Find all token sequences matching the supplied patterns on the Doc.
|
"""
|
||||||
|
Find all token sequences matching the supplied patterns on the Doc.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc):
|
doc (Doc):
|
||||||
|
@ -445,7 +452,8 @@ cdef class Matcher:
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
def pipe(self, docs, batch_size=1000, n_threads=2):
|
def pipe(self, docs, batch_size=1000, n_threads=2):
|
||||||
"""Match a stream of documents, yielding them in turn.
|
"""
|
||||||
|
Match a stream of documents, yielding them in turn.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
docs: A stream of documents.
|
docs: A stream of documents.
|
||||||
|
|
|
@ -16,7 +16,9 @@ from .attrs import LEMMA, intify_attrs
|
||||||
|
|
||||||
|
|
||||||
def _normalize_props(props):
|
def _normalize_props(props):
|
||||||
'''Transform deprecated string keys to correct names.'''
|
"""
|
||||||
|
Transform deprecated string keys to correct names.
|
||||||
|
"""
|
||||||
out = {}
|
out = {}
|
||||||
for key, value in props.items():
|
for key, value in props.items():
|
||||||
if key == POS:
|
if key == POS:
|
||||||
|
@ -98,13 +100,14 @@ cdef class Morphology:
|
||||||
flags[0] &= ~(one << flag_id)
|
flags[0] &= ~(one << flag_id)
|
||||||
|
|
||||||
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False):
|
||||||
'''Add a special-case rule to the morphological analyser. Tokens whose
|
"""
|
||||||
|
Add a special-case rule to the morphological analyser. Tokens whose
|
||||||
tag and orth match the rule will receive the specified properties.
|
tag and orth match the rule will receive the specified properties.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
tag (unicode): The part-of-speech tag to key the exception.
|
tag (unicode): The part-of-speech tag to key the exception.
|
||||||
orth (unicode): The word-form to key the exception.
|
orth (unicode): The word-form to key the exception.
|
||||||
'''
|
"""
|
||||||
tag = self.strings[tag_str]
|
tag = self.strings[tag_str]
|
||||||
tag_id = self.reverse_index[tag]
|
tag_id = self.reverse_index[tag]
|
||||||
orth = self.strings[orth_str]
|
orth = self.strings[orth_str]
|
||||||
|
|
|
@ -11,7 +11,9 @@ from .attrs import DEP, ENT_TYPE
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(Parser):
|
cdef class EntityRecognizer(Parser):
|
||||||
"""Annotate named entities on Doc objects."""
|
"""
|
||||||
|
Annotate named entities on Doc objects.
|
||||||
|
"""
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
feature_templates = get_feature_templates('ner')
|
feature_templates = get_feature_templates('ner')
|
||||||
|
@ -28,7 +30,9 @@ cdef class EntityRecognizer(Parser):
|
||||||
|
|
||||||
|
|
||||||
cdef class BeamEntityRecognizer(BeamParser):
|
cdef class BeamEntityRecognizer(BeamParser):
|
||||||
"""Annotate named entities on Doc objects."""
|
"""
|
||||||
|
Annotate named entities on Doc objects.
|
||||||
|
"""
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
feature_templates = get_feature_templates('ner')
|
feature_templates = get_feature_templates('ner')
|
||||||
|
|
|
@ -6,7 +6,9 @@ from .gold import tags_to_entities
|
||||||
|
|
||||||
|
|
||||||
class PRFScore(object):
|
class PRFScore(object):
|
||||||
"""A precision / recall / F score"""
|
"""
|
||||||
|
A precision / recall / F score
|
||||||
|
"""
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.tp = 0
|
self.tp = 0
|
||||||
self.fp = 0
|
self.fp = 0
|
||||||
|
|
|
@ -73,13 +73,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex
|
||||||
|
|
||||||
|
|
||||||
cdef class StringStore:
|
cdef class StringStore:
|
||||||
'''Map strings to and from integer IDs.'''
|
"""
|
||||||
|
Map strings to and from integer IDs.
|
||||||
|
"""
|
||||||
def __init__(self, strings=None, freeze=False):
|
def __init__(self, strings=None, freeze=False):
|
||||||
'''Create the StringStore.
|
"""
|
||||||
|
Create the StringStore.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
strings: A sequence of unicode strings to add to the store.
|
strings: A sequence of unicode strings to add to the store.
|
||||||
'''
|
"""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._map = PreshMap()
|
self._map = PreshMap()
|
||||||
self._oov = PreshMap()
|
self._oov = PreshMap()
|
||||||
|
@ -104,7 +107,8 @@ cdef class StringStore:
|
||||||
return (StringStore, (list(self),))
|
return (StringStore, (list(self),))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The number of strings in the store.
|
"""
|
||||||
|
The number of strings in the store.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
int The number of strings in the store.
|
int The number of strings in the store.
|
||||||
|
@ -112,8 +116,9 @@ cdef class StringStore:
|
||||||
return self.size-1
|
return self.size-1
|
||||||
|
|
||||||
def __getitem__(self, object string_or_id):
|
def __getitem__(self, object string_or_id):
|
||||||
"""Retrieve a string from a given integer ID, or vice versa.
|
"""
|
||||||
|
Retrieve a string from a given integer ID, or vice versa.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string_or_id (bytes or unicode or int):
|
string_or_id (bytes or unicode or int):
|
||||||
The value to encode.
|
The value to encode.
|
||||||
|
@ -159,7 +164,8 @@ cdef class StringStore:
|
||||||
return utf8str - self.c
|
return utf8str - self.c
|
||||||
|
|
||||||
def __contains__(self, unicode string not None):
|
def __contains__(self, unicode string not None):
|
||||||
"""Check whether a string is in the store.
|
"""
|
||||||
|
Check whether a string is in the store.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string (unicode): The string to check.
|
string (unicode): The string to check.
|
||||||
|
@ -172,7 +178,8 @@ cdef class StringStore:
|
||||||
return self._map.get(key) is not NULL
|
return self._map.get(key) is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""Iterate over the strings in the store, in order.
|
"""
|
||||||
|
Iterate over the strings in the store, in order.
|
||||||
|
|
||||||
Yields: unicode A string in the store.
|
Yields: unicode A string in the store.
|
||||||
"""
|
"""
|
||||||
|
@ -230,7 +237,8 @@ cdef class StringStore:
|
||||||
return &self.c[self.size-1]
|
return &self.c[self.size-1]
|
||||||
|
|
||||||
def dump(self, file_):
|
def dump(self, file_):
|
||||||
"""Save the strings to a JSON file.
|
"""
|
||||||
|
Save the strings to a JSON file.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
file_ (buffer): The file to save the strings.
|
file_ (buffer): The file to save the strings.
|
||||||
|
@ -244,7 +252,8 @@ cdef class StringStore:
|
||||||
file_.write(string_data)
|
file_.write(string_data)
|
||||||
|
|
||||||
def load(self, file_):
|
def load(self, file_):
|
||||||
"""Load the strings from a JSON file.
|
"""
|
||||||
|
Load the strings from a JSON file.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
file_ (buffer): The file from which to load the strings.
|
file_ (buffer): The file from which to load the strings.
|
||||||
|
|
|
@ -106,10 +106,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
|
||||||
|
|
||||||
|
|
||||||
cdef class Tagger:
|
cdef class Tagger:
|
||||||
"""Annotate part-of-speech tags on Doc objects."""
|
"""
|
||||||
|
Annotate part-of-speech tags on Doc objects.
|
||||||
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, vocab, require=False):
|
def load(cls, path, vocab, require=False):
|
||||||
"""Load the statistical model from the supplied path.
|
"""
|
||||||
|
Load the statistical model from the supplied path.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
path (Path):
|
path (Path):
|
||||||
|
@ -142,7 +145,8 @@ cdef class Tagger:
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
|
||||||
"""Create a Tagger.
|
"""
|
||||||
|
Create a Tagger.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab (Vocab):
|
vocab (Vocab):
|
||||||
|
@ -180,7 +184,8 @@ cdef class Tagger:
|
||||||
tokens._py_tokens = [None] * tokens.length
|
tokens._py_tokens = [None] * tokens.length
|
||||||
|
|
||||||
def __call__(self, Doc tokens):
|
def __call__(self, Doc tokens):
|
||||||
"""Apply the tagger, setting the POS tags onto the Doc object.
|
"""
|
||||||
|
Apply the tagger, setting the POS tags onto the Doc object.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc): The tokens to be tagged.
|
doc (Doc): The tokens to be tagged.
|
||||||
|
@ -208,7 +213,8 @@ cdef class Tagger:
|
||||||
tokens._py_tokens = [None] * tokens.length
|
tokens._py_tokens = [None] * tokens.length
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=1000, n_threads=2):
|
def pipe(self, stream, batch_size=1000, n_threads=2):
|
||||||
"""Tag a stream of documents.
|
"""
|
||||||
|
Tag a stream of documents.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
stream: The sequence of documents to tag.
|
stream: The sequence of documents to tag.
|
||||||
|
@ -225,7 +231,8 @@ cdef class Tagger:
|
||||||
yield doc
|
yield doc
|
||||||
|
|
||||||
def update(self, Doc tokens, GoldParse gold, itn=0):
|
def update(self, Doc tokens, GoldParse gold, itn=0):
|
||||||
"""Update the statistical model, with tags supplied for the given document.
|
"""
|
||||||
|
Update the statistical model, with tags supplied for the given document.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
doc (Doc):
|
doc (Doc):
|
||||||
|
|
|
@ -23,12 +23,15 @@ from .tokens.doc cimport Doc
|
||||||
|
|
||||||
|
|
||||||
cdef class Tokenizer:
|
cdef class Tokenizer:
|
||||||
"""Segment text, and create Doc objects with the discovered segment boundaries."""
|
"""
|
||||||
|
Segment text, and create Doc objects with the discovered segment boundaries.
|
||||||
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
|
def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
|
||||||
infix_finditer=None, token_match=None):
|
infix_finditer=None, token_match=None):
|
||||||
'''Load a Tokenizer, reading unsupplied components from the path.
|
"""
|
||||||
|
Load a Tokenizer, reading unsupplied components from the path.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
path (Path):
|
path (Path):
|
||||||
The path to load from.
|
The path to load from.
|
||||||
|
@ -45,10 +48,10 @@ cdef class Tokenizer:
|
||||||
infix_finditer:
|
infix_finditer:
|
||||||
Signature of re.compile(string).finditer
|
Signature of re.compile(string).finditer
|
||||||
Returns Tokenizer
|
Returns Tokenizer
|
||||||
'''
|
|
||||||
if isinstance(path, basestring):
|
if isinstance(path, basestring):
|
||||||
path = pathlib.Path(path)
|
path = pathlib.Path(path)
|
||||||
|
|
||||||
|
"""
|
||||||
if rules is None:
|
if rules is None:
|
||||||
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
||||||
rules = json.load(file_)
|
rules = json.load(file_)
|
||||||
|
@ -67,8 +70,9 @@ cdef class Tokenizer:
|
||||||
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
|
return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match)
|
||||||
|
|
||||||
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
|
||||||
'''Create a Tokenizer, to create Doc objects given unicode text.
|
"""
|
||||||
|
Create a Tokenizer, to create Doc objects given unicode text.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
vocab (Vocab):
|
vocab (Vocab):
|
||||||
A storage container for lexical types.
|
A storage container for lexical types.
|
||||||
|
@ -85,7 +89,7 @@ cdef class Tokenizer:
|
||||||
to find infixes.
|
to find infixes.
|
||||||
token_match:
|
token_match:
|
||||||
A boolean function matching strings that becomes tokens.
|
A boolean function matching strings that becomes tokens.
|
||||||
'''
|
"""
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self._cache = PreshMap()
|
self._cache = PreshMap()
|
||||||
self._specials = PreshMap()
|
self._specials = PreshMap()
|
||||||
|
@ -117,7 +121,8 @@ cdef class Tokenizer:
|
||||||
|
|
||||||
@cython.boundscheck(False)
|
@cython.boundscheck(False)
|
||||||
def __call__(self, unicode string):
|
def __call__(self, unicode string):
|
||||||
"""Tokenize a string.
|
"""
|
||||||
|
Tokenize a string.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string (unicode): The string to tokenize.
|
string (unicode): The string to tokenize.
|
||||||
|
@ -170,7 +175,8 @@ cdef class Tokenizer:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def pipe(self, texts, batch_size=1000, n_threads=2):
|
def pipe(self, texts, batch_size=1000, n_threads=2):
|
||||||
"""Tokenize a stream of texts.
|
"""
|
||||||
|
Tokenize a stream of texts.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
texts: A sequence of unicode texts.
|
texts: A sequence of unicode texts.
|
||||||
|
@ -324,7 +330,8 @@ cdef class Tokenizer:
|
||||||
self._cache.set(key, cached)
|
self._cache.set(key, cached)
|
||||||
|
|
||||||
def find_infix(self, unicode string):
|
def find_infix(self, unicode string):
|
||||||
"""Find internal split points of the string, such as hyphens.
|
"""
|
||||||
|
Find internal split points of the string, such as hyphens.
|
||||||
|
|
||||||
string (unicode): The string to segment.
|
string (unicode): The string to segment.
|
||||||
|
|
||||||
|
@ -337,7 +344,8 @@ cdef class Tokenizer:
|
||||||
return list(self.infix_finditer(string))
|
return list(self.infix_finditer(string))
|
||||||
|
|
||||||
def find_prefix(self, unicode string):
|
def find_prefix(self, unicode string):
|
||||||
"""Find the length of a prefix that should be segmented from the string,
|
"""
|
||||||
|
Find the length of a prefix that should be segmented from the string,
|
||||||
or None if no prefix rules match.
|
or None if no prefix rules match.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
@ -350,7 +358,8 @@ cdef class Tokenizer:
|
||||||
return (match.end() - match.start()) if match is not None else 0
|
return (match.end() - match.start()) if match is not None else 0
|
||||||
|
|
||||||
def find_suffix(self, unicode string):
|
def find_suffix(self, unicode string):
|
||||||
"""Find the length of a suffix that should be segmented from the string,
|
"""
|
||||||
|
Find the length of a suffix that should be segmented from the string,
|
||||||
or None if no suffix rules match.
|
or None if no suffix rules match.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
@ -363,13 +372,15 @@ cdef class Tokenizer:
|
||||||
return (match.end() - match.start()) if match is not None else 0
|
return (match.end() - match.start()) if match is not None else 0
|
||||||
|
|
||||||
def _load_special_tokenization(self, special_cases):
|
def _load_special_tokenization(self, special_cases):
|
||||||
'''Add special-case tokenization rules.
|
"""
|
||||||
'''
|
Add special-case tokenization rules.
|
||||||
|
"""
|
||||||
for chunk, substrings in sorted(special_cases.items()):
|
for chunk, substrings in sorted(special_cases.items()):
|
||||||
self.add_special_case(chunk, substrings)
|
self.add_special_case(chunk, substrings)
|
||||||
|
|
||||||
def add_special_case(self, unicode string, substrings):
|
def add_special_case(self, unicode string, substrings):
|
||||||
'''Add a special-case tokenization rule.
|
"""
|
||||||
|
Add a special-case tokenization rule.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string (unicode): The string to specially tokenize.
|
string (unicode): The string to specially tokenize.
|
||||||
|
@ -378,7 +389,7 @@ cdef class Tokenizer:
|
||||||
attributes. The ORTH fields of the attributes must exactly match
|
attributes. The ORTH fields of the attributes must exactly match
|
||||||
the string when they are concatenated.
|
the string when they are concatenated.
|
||||||
Returns None
|
Returns None
|
||||||
'''
|
"""
|
||||||
substrings = list(substrings)
|
substrings = list(substrings)
|
||||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||||
cached.length = len(substrings)
|
cached.length = len(substrings)
|
||||||
|
|
|
@ -9,7 +9,9 @@ from .gold import merge_sents
|
||||||
|
|
||||||
|
|
||||||
class Trainer(object):
|
class Trainer(object):
|
||||||
'''Manage training of an NLP pipeline.'''
|
"""
|
||||||
|
Manage training of an NLP pipeline.
|
||||||
|
"""
|
||||||
def __init__(self, nlp, gold_tuples):
|
def __init__(self, nlp, gold_tuples):
|
||||||
self.nlp = nlp
|
self.nlp = nlp
|
||||||
self.gold_tuples = gold_tuples
|
self.gold_tuples = gold_tuples
|
||||||
|
|
|
@ -48,8 +48,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC
|
||||||
|
|
||||||
|
|
||||||
cdef class Vocab:
|
cdef class Vocab:
|
||||||
'''A map container for a language's LexemeC structs.
|
"""
|
||||||
'''
|
A map container for a language's LexemeC structs.
|
||||||
|
"""
|
||||||
@classmethod
|
@classmethod
|
||||||
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
|
def load(cls, path, lex_attr_getters=None, lemmatizer=True,
|
||||||
tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
|
tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs):
|
||||||
|
@ -108,7 +109,8 @@ cdef class Vocab:
|
||||||
|
|
||||||
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
|
||||||
serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
|
serializer_freqs=None, strings=tuple(), **deprecated_kwargs):
|
||||||
'''Create the vocabulary.
|
"""
|
||||||
|
Create the vocabulary.
|
||||||
|
|
||||||
lex_attr_getters (dict):
|
lex_attr_getters (dict):
|
||||||
A dictionary mapping attribute IDs to functions to compute them.
|
A dictionary mapping attribute IDs to functions to compute them.
|
||||||
|
@ -123,7 +125,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Vocab: The newly constructed vocab object.
|
Vocab: The newly constructed vocab object.
|
||||||
'''
|
"""
|
||||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||||
|
|
||||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||||
|
@ -172,17 +174,19 @@ cdef class Vocab:
|
||||||
return langfunc('_') if langfunc else ''
|
return langfunc('_') if langfunc else ''
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The current number of lexemes stored."""
|
"""
|
||||||
|
The current number of lexemes stored.
|
||||||
|
"""
|
||||||
return self.length
|
return self.length
|
||||||
|
|
||||||
def resize_vectors(self, int new_size):
|
def resize_vectors(self, int new_size):
|
||||||
'''
|
"""
|
||||||
Set vectors_length to a new size, and allocate more memory for the Lexeme
|
Set vectors_length to a new size, and allocate more memory for the Lexeme
|
||||||
vectors if necessary. The memory will be zeroed.
|
vectors if necessary. The memory will be zeroed.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
new_size (int): The new size of the vectors.
|
new_size (int): The new size of the vectors.
|
||||||
'''
|
"""
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
if new_size > self.vectors_length:
|
if new_size > self.vectors_length:
|
||||||
|
@ -193,7 +197,8 @@ cdef class Vocab:
|
||||||
self.vectors_length = new_size
|
self.vectors_length = new_size
|
||||||
|
|
||||||
def add_flag(self, flag_getter, int flag_id=-1):
|
def add_flag(self, flag_getter, int flag_id=-1):
|
||||||
'''Set a new boolean flag to words in the vocabulary.
|
"""
|
||||||
|
Set a new boolean flag to words in the vocabulary.
|
||||||
|
|
||||||
The flag_setter function will be called over the words currently in the
|
The flag_setter function will be called over the words currently in the
|
||||||
vocab, and then applied to new words as they occur. You'll then be able
|
vocab, and then applied to new words as they occur. You'll then be able
|
||||||
|
@ -213,7 +218,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
flag_id (int): The integer ID by which the flag value can be checked.
|
flag_id (int): The integer ID by which the flag value can be checked.
|
||||||
'''
|
"""
|
||||||
if flag_id == -1:
|
if flag_id == -1:
|
||||||
for bit in range(1, 64):
|
for bit in range(1, 64):
|
||||||
if bit not in self.lex_attr_getters:
|
if bit not in self.lex_attr_getters:
|
||||||
|
@ -234,9 +239,11 @@ cdef class Vocab:
|
||||||
return flag_id
|
return flag_id
|
||||||
|
|
||||||
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
cdef const LexemeC* get(self, Pool mem, unicode string) except NULL:
|
||||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
"""
|
||||||
|
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||||
|
"""
|
||||||
if string == u'':
|
if string == u'':
|
||||||
return &EMPTY_LEXEME
|
return &EMPTY_LEXEME
|
||||||
cdef LexemeC* lex
|
cdef LexemeC* lex
|
||||||
|
@ -252,9 +259,11 @@ cdef class Vocab:
|
||||||
return self._new_lexeme(mem, string)
|
return self._new_lexeme(mem, string)
|
||||||
|
|
||||||
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL:
|
||||||
'''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
"""
|
||||||
|
Get a pointer to a LexemeC from the lexicon, creating a new Lexeme
|
||||||
if necessary, using memory acquired from the given pool. If the pool
|
if necessary, using memory acquired from the given pool. If the pool
|
||||||
is the lexicon's own memory, the lexeme is saved in the lexicon.'''
|
is the lexicon's own memory, the lexeme is saved in the lexicon.
|
||||||
|
"""
|
||||||
if orth == 0:
|
if orth == 0:
|
||||||
return &EMPTY_LEXEME
|
return &EMPTY_LEXEME
|
||||||
cdef LexemeC* lex
|
cdef LexemeC* lex
|
||||||
|
@ -297,30 +306,33 @@ cdef class Vocab:
|
||||||
self.length += 1
|
self.length += 1
|
||||||
|
|
||||||
def __contains__(self, unicode string):
|
def __contains__(self, unicode string):
|
||||||
'''Check whether the string has an entry in the vocabulary.
|
"""
|
||||||
|
Check whether the string has an entry in the vocabulary.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
string (unicode): The ID string.
|
string (unicode): The ID string.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
bool Whether the string has an entry in the vocabulary.
|
bool Whether the string has an entry in the vocabulary.
|
||||||
'''
|
"""
|
||||||
key = hash_string(string)
|
key = hash_string(string)
|
||||||
lex = self._by_hash.get(key)
|
lex = self._by_hash.get(key)
|
||||||
return lex is not NULL
|
return lex is not NULL
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
'''Iterate over the lexemes in the vocabulary.
|
"""
|
||||||
|
Iterate over the lexemes in the vocabulary.
|
||||||
|
|
||||||
Yields: Lexeme An entry in the vocabulary.
|
Yields: Lexeme An entry in the vocabulary.
|
||||||
'''
|
"""
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
cdef size_t addr
|
cdef size_t addr
|
||||||
for orth, addr in self._by_orth.items():
|
for orth, addr in self._by_orth.items():
|
||||||
yield Lexeme(self, orth)
|
yield Lexeme(self, orth)
|
||||||
|
|
||||||
def __getitem__(self, id_or_string):
|
def __getitem__(self, id_or_string):
|
||||||
'''Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
"""
|
||||||
|
Retrieve a lexeme, given an int ID or a unicode string. If a previously
|
||||||
unseen unicode string is given, a new lexeme is created and stored.
|
unseen unicode string is given, a new lexeme is created and stored.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
@ -332,7 +344,7 @@ cdef class Vocab:
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
lexeme (Lexeme): The lexeme indicated by the given ID.
|
lexeme (Lexeme): The lexeme indicated by the given ID.
|
||||||
'''
|
"""
|
||||||
cdef attr_t orth
|
cdef attr_t orth
|
||||||
if type(id_or_string) == unicode:
|
if type(id_or_string) == unicode:
|
||||||
orth = self.strings[id_or_string]
|
orth = self.strings[id_or_string]
|
||||||
|
@ -355,7 +367,8 @@ cdef class Vocab:
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
def dump(self, loc=None):
|
def dump(self, loc=None):
|
||||||
"""Save the lexemes binary data to the given location, or
|
"""
|
||||||
|
Save the lexemes binary data to the given location, or
|
||||||
return a byte-string with the data if loc is None.
|
return a byte-string with the data if loc is None.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
|
@ -392,14 +405,15 @@ cdef class Vocab:
|
||||||
return fp.string_data()
|
return fp.string_data()
|
||||||
|
|
||||||
def load_lexemes(self, loc):
|
def load_lexemes(self, loc):
|
||||||
'''Load the binary vocabulary data from the given location.
|
"""
|
||||||
|
Load the binary vocabulary data from the given location.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
loc (Path): The path to load from.
|
loc (Path): The path to load from.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
'''
|
"""
|
||||||
fp = CFile(loc, 'rb',
|
fp = CFile(loc, 'rb',
|
||||||
on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
|
on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc))
|
||||||
cdef LexemeC* lexeme = NULL
|
cdef LexemeC* lexeme = NULL
|
||||||
|
@ -440,8 +454,9 @@ cdef class Vocab:
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
def _deserialize_lexemes(self, CFile fp):
|
def _deserialize_lexemes(self, CFile fp):
|
||||||
'''Load the binary vocabulary data from the given CFile.
|
"""
|
||||||
'''
|
Load the binary vocabulary data from the given CFile.
|
||||||
|
"""
|
||||||
cdef LexemeC* lexeme = NULL
|
cdef LexemeC* lexeme = NULL
|
||||||
cdef hash_t key
|
cdef hash_t key
|
||||||
cdef unicode py_str
|
cdef unicode py_str
|
||||||
|
@ -494,13 +509,14 @@ cdef class Vocab:
|
||||||
fp.close()
|
fp.close()
|
||||||
|
|
||||||
def dump_vectors(self, out_loc):
|
def dump_vectors(self, out_loc):
|
||||||
'''Save the word vectors to a binary file.
|
"""
|
||||||
|
Save the word vectors to a binary file.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
loc (Path): The path to save to.
|
loc (Path): The path to save to.
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
'''
|
"""
|
||||||
cdef int32_t vec_len = self.vectors_length
|
cdef int32_t vec_len = self.vectors_length
|
||||||
cdef int32_t word_len
|
cdef int32_t word_len
|
||||||
cdef bytes word_str
|
cdef bytes word_str
|
||||||
|
@ -522,7 +538,8 @@ cdef class Vocab:
|
||||||
out_file.close()
|
out_file.close()
|
||||||
|
|
||||||
def load_vectors(self, file_):
|
def load_vectors(self, file_):
|
||||||
"""Load vectors from a text-based file.
|
"""
|
||||||
|
Load vectors from a text-based file.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
file_ (buffer): The file to read from. Entries should be separated by newlines,
|
file_ (buffer): The file to read from. Entries should be separated by newlines,
|
||||||
|
@ -561,7 +578,8 @@ cdef class Vocab:
|
||||||
return vec_len
|
return vec_len
|
||||||
|
|
||||||
def load_vectors_from_bin_loc(self, loc):
|
def load_vectors_from_bin_loc(self, loc):
|
||||||
"""Load vectors from the location of a binary file.
|
"""
|
||||||
|
Load vectors from the location of a binary file.
|
||||||
|
|
||||||
Arguments:
|
Arguments:
|
||||||
loc (unicode): The path of the binary file to load from.
|
loc (unicode): The path of the binary file to load from.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user