spaCy/spacy/util.py
2017-05-14 00:37:53 +02:00

231 lines
6.7 KiB
Python

# coding: utf8
from __future__ import unicode_literals, print_function
import ujson
import regex as re
from pathlib import Path
import sys
import textwrap
from .compat import basestring_, unicode_, input_
LANGUAGES = {}
_data_path = Path(__file__).parent / 'data'
def set_lang_class(name, cls):
global LANGUAGES
LANGUAGES[name] = cls
def get_lang_class(name):
if name in LANGUAGES:
return LANGUAGES[name]
lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
if lang not in LANGUAGES:
raise RuntimeError('Language not supported: %s' % name)
return LANGUAGES[lang]
def get_data_path(require_exists=True):
if not require_exists:
return _data_path
else:
return _data_path if _data_path.exists() else None
def set_data_path(path):
global _data_path
_data_path = ensure_path(path)
def ensure_path(path):
if isinstance(path, basestring_):
return Path(path)
else:
return path
def get_cuda_stream(require=False):
# TODO: Error and tell to install chainer if not found
# Requires GPU
try:
from cupy.cuda.stream import Stream
except ImportError:
return None
return Stream()
def read_regex(path):
path = ensure_path(path)
with path.open() as file_:
entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return re.compile(expression)
def compile_prefix_regex(entries):
if '(' in entries:
# Handle deprecated data
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return re.compile(expression)
else:
expression = '|'.join(['^' + piece for piece in entries if piece.strip()])
return re.compile(expression)
def compile_suffix_regex(entries):
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
return re.compile(expression)
def compile_infix_regex(entries):
expression = '|'.join([piece for piece in entries if piece.strip()])
return re.compile(expression)
def normalize_slice(length, start, stop, step=None):
if not (step is None or step == 1):
raise ValueError("Stepped slices not supported in Span objects."
"Try: list(tokens)[start:stop:step] instead.")
if start is None:
start = 0
elif start < 0:
start += length
start = min(length, max(0, start))
if stop is None:
stop = length
elif stop < 0:
stop += length
stop = min(length, max(start, stop))
assert 0 <= start <= stop <= length
return start, stop
def check_renamed_kwargs(renamed, kwargs):
for old, new in renamed.items():
if old in kwargs:
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
def read_json(location):
with location.open('r', encoding='utf8') as f:
return ujson.load(f)
def parse_package_meta(package_path, package, require=True):
"""
Check if a meta.json exists in a package and return its contents as a
dictionary. If require is set to True, raise an error if no meta.json found.
"""
# TODO: Allow passing in full model path and only require one argument
# instead of path and package name. This lets us avoid passing in an awkward
# empty string in spacy.load() if user supplies full model path.
location = package_path / package / 'meta.json'
if location.is_file():
return read_json(location)
elif require:
raise IOError("Could not read meta.json from %s" % location)
else:
return None
def get_raw_input(description, default=False):
"""
Get user input via raw_input / input and return input value. Takes a
description for the prompt, and an optional default value that's displayed
with the prompt.
"""
additional = ' (default: {d})'.format(d=default) if default else ''
prompt = ' {d}{a}: '.format(d=description, a=additional)
user_input = input_(prompt)
return user_input
def print_table(data, **kwargs):
"""
Print data in table format. Can either take a list of tuples or a
dictionary, which will be converted to a list of tuples.
"""
if type(data) == dict:
data = list(data.items())
tpl_msg = '\n{msg}\n'
tpl_title = '\n \033[93m{msg}\033[0m'
tpl_row =" {:<15}" * len(data[0])
table = '\n'.join([tpl_row.format(l, v) for l, v in data])
if 'title' in kwargs and kwargs['title']:
print(tpl_title.format(msg=kwargs['title']))
print(tpl_msg.format(msg=table))
def print_markdown(data, **kwargs):
"""
Print listed data in GitHub-flavoured Markdown format so it can be
copy-pasted into issues. Can either take a list of tuples or a dictionary,
which will be converted to a list of tuples.
"""
def excl_value(value):
# don't print value if it contains absolute path of directory (i.e.
# personal info). Other conditions can be included here if necessary.
if unicode_(Path(__file__).parent) in value:
return True
if type(data) == dict:
data = list(data.items())
tpl_msg = "\n{msg}\n"
tpl_title = "\n## {msg}"
tpl_row = "* **{l}:** {v}"
markdown = '\n'.join([tpl_row.format(l=l, v=v) for l, v in data if not excl_value(v)])
if 'title' in kwargs and kwargs['title']:
print(tpl_title.format(msg=kwargs['title']))
print(tpl_msg.format(msg=markdown))
def print_msg(*text, **kwargs):
"""
Print formatted message. Each positional argument is rendered as newline-
separated paragraph. If kwarg 'title' exist, title is printed above the text
and highlighted (using ANSI escape sequences manually to avoid unnecessary
dependency).
"""
message = '\n\n'.join([_wrap_text(t) for t in text])
tpl_msg = '\n{msg}\n'
tpl_title = '\n\033[93m{msg}\033[0m'
if 'title' in kwargs and kwargs['title']:
title = _wrap_text(kwargs['title'])
print(tpl_title.format(msg=title))
print(tpl_msg.format(msg=message))
def _wrap_text(text):
"""
Wrap text at given width using textwrap module. Indent should consist of
spaces. Its length is deducted from wrap width to ensure exact wrapping.
"""
wrap_max = 80
indent = ' '
wrap_width = wrap_max - len(indent)
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
subsequent_indent=indent, break_long_words=False,
break_on_hyphens=False)
def sys_exit(*messages, **kwargs):
"""
Performs SystemExit. For modules used from the command line, like
download and link. To print message, use the same arguments as for
print_msg().
"""
if messages:
print_msg(*messages, **kwargs)
sys.exit(0)