mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Add docstrings, error messages and fix consistency
This commit is contained in:
parent
ee7dcf65c9
commit
1694c24e52
138
spacy/util.py
138
spacy/util.py
|
@ -32,11 +32,25 @@ def get_lang_class(name):
|
||||||
|
|
||||||
|
|
||||||
def load_lang_class(lang):
|
def load_lang_class(lang):
|
||||||
|
"""Import and load a Language class.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lang (unicode): Two-letter language code, e.g. 'en'.
|
||||||
|
Returns:
|
||||||
|
Language: Language class.
|
||||||
|
"""
|
||||||
module = importlib.import_module('.lang.%s' % lang, 'spacy')
|
module = importlib.import_module('.lang.%s' % lang, 'spacy')
|
||||||
return getattr(module, module.__all__[0])
|
return getattr(module, module.__all__[0])
|
||||||
|
|
||||||
|
|
||||||
def get_data_path(require_exists=True):
|
def get_data_path(require_exists=True):
|
||||||
|
"""Get path to spaCy data directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
require_exists (bool): Only return path if it exists, otherwise None.
|
||||||
|
Returns:
|
||||||
|
Path or None: Data path or None.
|
||||||
|
"""
|
||||||
if not require_exists:
|
if not require_exists:
|
||||||
return _data_path
|
return _data_path
|
||||||
else:
|
else:
|
||||||
|
@ -44,6 +58,11 @@ def get_data_path(require_exists=True):
|
||||||
|
|
||||||
|
|
||||||
def set_data_path(path):
|
def set_data_path(path):
|
||||||
|
"""Set path to spaCy data directory.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path (unicode or Path): Path to new data directory.
|
||||||
|
"""
|
||||||
global _data_path
|
global _data_path
|
||||||
_data_path = ensure_path(path)
|
_data_path = ensure_path(path)
|
||||||
|
|
||||||
|
@ -56,6 +75,13 @@ def ensure_path(path):
|
||||||
|
|
||||||
|
|
||||||
def resolve_model_path(name):
|
def resolve_model_path(name):
|
||||||
|
"""Resolve a model name or string to a model path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name (unicode): Package name, shortcut link or model path.
|
||||||
|
Returns:
|
||||||
|
Path: Path to model data directory.
|
||||||
|
"""
|
||||||
data_path = get_data_path()
|
data_path = get_data_path()
|
||||||
if not data_path or not data_path.exists():
|
if not data_path or not data_path.exists():
|
||||||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||||||
|
@ -71,18 +97,30 @@ def resolve_model_path(name):
|
||||||
raise IOError("Can't find model '%s'" % name)
|
raise IOError("Can't find model '%s'" % name)
|
||||||
|
|
||||||
|
|
||||||
def is_package(origin):
|
def is_package(name):
|
||||||
"""
|
"""Check if string maps to a package installed via pip.
|
||||||
Check if string maps to a package installed via pip.
|
|
||||||
|
Args:
|
||||||
|
name (unicode): Name of package.
|
||||||
|
Returns:
|
||||||
|
bool: True if installed package, False if not.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
packages = pip.get_installed_distributions()
|
packages = pip.get_installed_distributions()
|
||||||
for package in packages:
|
for package in packages:
|
||||||
if package.project_name.replace('-', '_') == origin:
|
if package.project_name.replace('-', '_') == name:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def get_model_package_path(package_name):
|
def get_model_package_path(package_name):
|
||||||
|
"""Get path to a model package installed via pip.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
package_name (unicode): Name of installed package.
|
||||||
|
Returns:
|
||||||
|
Path: Path to model data directory.
|
||||||
|
"""
|
||||||
# Here we're importing the module just to find it. This is worryingly
|
# Here we're importing the module just to find it. This is worryingly
|
||||||
# indirect, but it's otherwise very difficult to find the package.
|
# indirect, but it's otherwise very difficult to find the package.
|
||||||
# Python's installation and import rules are very complicated.
|
# Python's installation and import rules are very complicated.
|
||||||
|
@ -94,9 +132,13 @@ def get_model_package_path(package_name):
|
||||||
|
|
||||||
|
|
||||||
def parse_package_meta(package_path, require=True):
|
def parse_package_meta(package_path, require=True):
|
||||||
"""
|
"""Check if a meta.json exists in a package and return its contents.
|
||||||
Check if a meta.json exists in a package and return its contents as a
|
|
||||||
dictionary. If require is set to True, raise an error if no meta.json found.
|
Args:
|
||||||
|
package_path (Path): Path to model package directory.
|
||||||
|
require (bool): If True, raise error if no meta.json is found.
|
||||||
|
Returns:
|
||||||
|
dict or None: Model meta.json data or None.
|
||||||
"""
|
"""
|
||||||
location = package_path / 'meta.json'
|
location = package_path / 'meta.json'
|
||||||
if location.is_file():
|
if location.is_file():
|
||||||
|
@ -136,6 +178,14 @@ def compile_infix_regex(entries):
|
||||||
|
|
||||||
|
|
||||||
def update_exc(base_exceptions, *addition_dicts):
|
def update_exc(base_exceptions, *addition_dicts):
|
||||||
|
"""Update and validate tokenizer exceptions. Will overwrite exceptions.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
base_exceptions (dict): Base exceptions.
|
||||||
|
*addition_dicts (dict): Exceptions to add to the base dict, in order.
|
||||||
|
Returns:
|
||||||
|
dict: Combined tokenizer exceptions.
|
||||||
|
"""
|
||||||
exc = dict(base_exceptions)
|
exc = dict(base_exceptions)
|
||||||
for additions in addition_dicts:
|
for additions in addition_dicts:
|
||||||
for orth, token_attrs in additions.items():
|
for orth, token_attrs in additions.items():
|
||||||
|
@ -144,9 +194,9 @@ def update_exc(base_exceptions, *addition_dicts):
|
||||||
raise ValueError(msg % (orth, token_attrs))
|
raise ValueError(msg % (orth, token_attrs))
|
||||||
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
|
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
|
||||||
if orth != described_orth:
|
if orth != described_orth:
|
||||||
# TODO: Better error
|
raise ValueError("Invalid tokenizer exception: ORTH values "
|
||||||
msg = "Invalid tokenizer exception: key='%s', orths='%s'"
|
"combined don't match original string. "
|
||||||
raise ValueError(msg % (orth, described_orth))
|
"key='%s', orths='%s'" % (orth, described_orth))
|
||||||
# overlap = set(exc.keys()).intersection(set(additions))
|
# overlap = set(exc.keys()).intersection(set(additions))
|
||||||
# assert not overlap, overlap
|
# assert not overlap, overlap
|
||||||
exc.update(additions)
|
exc.update(additions)
|
||||||
|
@ -155,6 +205,16 @@ def update_exc(base_exceptions, *addition_dicts):
|
||||||
|
|
||||||
|
|
||||||
def expand_exc(excs, search, replace):
|
def expand_exc(excs, search, replace):
|
||||||
|
"""Find string in tokenizer exceptions, duplicate entry and replace string.
|
||||||
|
For example, to add additional versions with typographic apostrophes.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
excs (dict): Tokenizer exceptions.
|
||||||
|
search (unicode): String to find and replace.
|
||||||
|
replace (unicode): Replacement.
|
||||||
|
Returns:
|
||||||
|
dict:
|
||||||
|
"""
|
||||||
def _fix_token(token, search, replace):
|
def _fix_token(token, search, replace):
|
||||||
fixed = dict(token)
|
fixed = dict(token)
|
||||||
fixed[ORTH] = fixed[ORTH].replace(search, replace)
|
fixed[ORTH] = fixed[ORTH].replace(search, replace)
|
||||||
|
@ -195,14 +255,25 @@ def check_renamed_kwargs(renamed, kwargs):
|
||||||
|
|
||||||
|
|
||||||
def read_json(location):
|
def read_json(location):
|
||||||
|
"""Open and load JSON from file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
location (Path): Path to JSON file.
|
||||||
|
Returns:
|
||||||
|
dict: Loaded JSON content.
|
||||||
|
"""
|
||||||
with location.open('r', encoding='utf8') as f:
|
with location.open('r', encoding='utf8') as f:
|
||||||
return ujson.load(f)
|
return ujson.load(f)
|
||||||
|
|
||||||
|
|
||||||
def get_raw_input(description, default=False):
|
def get_raw_input(description, default=False):
|
||||||
"""
|
"""Get user input from the command line via raw_input / input.
|
||||||
Get user input via raw_input / input and return input value. Takes a
|
|
||||||
description, and an optional default value to display with the prompt.
|
Args:
|
||||||
|
description (unicode): Text to display before prompt.
|
||||||
|
default (unicode or False/None): Default value to display with prompt.
|
||||||
|
Returns:
|
||||||
|
unicode: User input.
|
||||||
"""
|
"""
|
||||||
additional = ' (default: %s)' % default if default else ''
|
additional = ' (default: %s)' % default if default else ''
|
||||||
prompt = ' %s%s: ' % (description, additional)
|
prompt = ' %s%s: ' % (description, additional)
|
||||||
|
@ -211,11 +282,13 @@ def get_raw_input(description, default=False):
|
||||||
|
|
||||||
|
|
||||||
def print_table(data, title=None):
|
def print_table(data, title=None):
|
||||||
|
"""Print data in table format.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
data (dict or list of tuples): Label/value pairs.
|
||||||
|
title (unicode or None): Title, will be printed above.
|
||||||
"""
|
"""
|
||||||
Print data in table format. Can either take a list of tuples or a
|
if isinstance(data, dict):
|
||||||
dictionary, which will be converted to a list of tuples.
|
|
||||||
"""
|
|
||||||
if type(data) == dict:
|
|
||||||
data = list(data.items())
|
data = list(data.items())
|
||||||
tpl_row = ' {:<15}' * len(data[0])
|
tpl_row = ' {:<15}' * len(data[0])
|
||||||
table = '\n'.join([tpl_row.format(l, v) for l, v in data])
|
table = '\n'.join([tpl_row.format(l, v) for l, v in data])
|
||||||
|
@ -225,14 +298,16 @@ def print_table(data, title=None):
|
||||||
|
|
||||||
|
|
||||||
def print_markdown(data, title=None):
|
def print_markdown(data, title=None):
|
||||||
"""
|
"""Print data in GitHub-flavoured Markdown format for issues etc.
|
||||||
Print listed data in GitHub-flavoured Markdown format so it can be
|
|
||||||
copy-pasted into issues. Can either take a list of tuples or a dictionary.
|
Args:
|
||||||
|
data (dict or list of tuples): Label/value pairs.
|
||||||
|
title (unicode or None): Title, will be rendered as headline 2.
|
||||||
"""
|
"""
|
||||||
def excl_value(value):
|
def excl_value(value):
|
||||||
return Path(value).exists() # contains path (personal info)
|
return Path(value).exists() # contains path (personal info)
|
||||||
|
|
||||||
if type(data) == dict:
|
if isinstance(data, dict):
|
||||||
data = list(data.items())
|
data = list(data.items())
|
||||||
markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
|
markdown = ["* **{}:** {}".format(l, v) for l, v in data if not excl_value(v)]
|
||||||
if title:
|
if title:
|
||||||
|
@ -241,10 +316,12 @@ def print_markdown(data, title=None):
|
||||||
|
|
||||||
|
|
||||||
def prints(*texts, **kwargs):
|
def prints(*texts, **kwargs):
|
||||||
"""
|
"""Print formatted message (manual ANSI escape sequences to avoid dependency)
|
||||||
Print formatted message. Each positional argument is rendered as newline-
|
|
||||||
separated paragraph. An optional highlighted title is printed above the text
|
Args:
|
||||||
(using ANSI escape sequences manually to avoid unnecessary dependency).
|
*texts (unicode): Texts to print. Each argument is rendered as paragraph.
|
||||||
|
**kwargs: 'title' is rendered as coloured headline. 'exits'=True performs
|
||||||
|
system exit after printing.
|
||||||
"""
|
"""
|
||||||
exits = kwargs.get('exits', False)
|
exits = kwargs.get('exits', False)
|
||||||
title = kwargs.get('title', None)
|
title = kwargs.get('title', None)
|
||||||
|
@ -256,9 +333,14 @@ def prints(*texts, **kwargs):
|
||||||
|
|
||||||
|
|
||||||
def _wrap(text, wrap_max=80, indent=4):
|
def _wrap(text, wrap_max=80, indent=4):
|
||||||
"""
|
"""Wrap text at given width using textwrap module.
|
||||||
Wrap text at given width using textwrap module. Indent should consist of
|
|
||||||
spaces. Its length is deducted from wrap width to ensure exact wrapping.
|
Args:
|
||||||
|
text (unicode): Text to wrap. If it's a Path, it's converted to string.
|
||||||
|
wrap_max (int): Maximum line length (indent is deducted).
|
||||||
|
indent (int): Number of spaces for indentation.
|
||||||
|
Returns:
|
||||||
|
unicode: Wrapped text.
|
||||||
"""
|
"""
|
||||||
indent = indent * ' '
|
indent = indent * ' '
|
||||||
wrap_width = wrap_max - len(indent)
|
wrap_width = wrap_max - len(indent)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user