mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 20:16:23 +03:00
221 lines
6.5 KiB
Python
221 lines
6.5 KiB
Python
# coding: utf8
|
|
from __future__ import unicode_literals, print_function
|
|
|
|
import ujson
|
|
import regex as re
|
|
from pathlib import Path
|
|
import sys
|
|
import textwrap
|
|
|
|
from .compat import basestring_, unicode_, input_
|
|
|
|
|
|
# Module-level registry mapping a language name to its language class.
# Written by set_lang_class() and read by get_lang_class().
LANGUAGES = {}
|
|
# Current data directory; defaults to the 'data' folder next to this module.
# Returned by get_data_path() and replaced by set_data_path().
_data_path = Path(__file__).parent / 'data'
|
|
|
|
|
|
def set_lang_class(name, cls):
    """
    Register `cls` in the module-level LANGUAGES table under the key `name`.
    An existing entry stored under the same name is overwritten.
    """
    # The registry dict is mutated in place, so the name LANGUAGES itself is
    # never rebound and no `global` declaration is needed.
    LANGUAGES.update({name: cls})
|
|
|
|
|
|
def get_lang_class(name):
    """
    Look up the class registered in LANGUAGES for `name`.

    The full name is tried first. If it is not registered, the name is cut at
    its first non-alphanumeric character and the leading part is looked up
    instead. Raises RuntimeError if neither lookup finds an entry.
    """
    try:
        return LANGUAGES[name]
    except KeyError:
        # No exact match; fall through to the prefix lookup below.
        pass
    prefix = re.split('[^a-zA-Z0-9]', name, 1)[0]
    if prefix in LANGUAGES:
        return LANGUAGES[prefix]
    raise RuntimeError('Language not supported: %s' % name)
|
|
|
|
|
|
def get_data_path(require_exists=True):
    """
    Return the module's current data path.

    When `require_exists` is true and the path does not exist, None is
    returned instead. When it is false, the path is returned without any
    existence check.
    """
    if require_exists and not _data_path.exists():
        return None
    return _data_path
|
|
|
|
|
|
def set_data_path(path):
    """
    Replace the module-level data path with `path`. The value is passed
    through ensure_path() before it is stored.
    """
    global _data_path
    new_location = ensure_path(path)
    _data_path = new_location
|
|
|
|
|
|
def ensure_path(path):
    """
    Return `path` wrapped in a Path object if it is a string; any other
    value is handed back unchanged.
    """
    return Path(path) if isinstance(path, basestring_) else path
|
|
|
|
|
|
def read_regex(path):
    """
    Build a compiled regular expression from a file of literal strings.

    The file at `path` (a string or Path-like object) is split on newlines.
    Every line that is not blank is escaped, anchored to the start of the
    input with '^', and the results are joined into one alternation.
    Returns the compiled pattern.
    """
    path = ensure_path(path)
    # Decode explicitly as UTF-8 (as read_json does) so that the result does
    # not depend on the locale's default encoding.
    with path.open('r', encoding='utf8') as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece)
                           for piece in entries if piece.strip()])
    return re.compile(expression)
|
|
|
|
|
|
def compile_prefix_regex(entries):
    """
    Compile prefix entries into one regular expression in which each
    alternative is anchored to the start of the input with '^'.

    Entries that are empty or whitespace-only are skipped. If '(' itself is
    one of the entries, every entry is escaped and matched literally;
    otherwise the entries are used as regex fragments as given.
    """
    if '(' in entries:
        # Handle deprecated data: treat every entry as a literal string.
        def as_fragment(piece):
            return re.escape(piece)
    else:
        def as_fragment(piece):
            return piece

    alternatives = []
    for piece in entries:
        if piece.strip():
            alternatives.append('^' + as_fragment(piece))
    return re.compile('|'.join(alternatives))
|
|
|
|
|
|
def compile_suffix_regex(entries):
    """
    Compile suffix entries into one regular expression in which each
    alternative is anchored to the end of the input with '$'. Entries that
    are empty or whitespace-only are skipped.
    """
    anchored = []
    for piece in entries:
        if piece.strip():
            anchored.append(piece + '$')
    return re.compile('|'.join(anchored))
|
|
|
|
|
|
def compile_infix_regex(entries):
    """
    Compile infix entries into one unanchored regular expression made of
    their alternation. Entries that are empty or whitespace-only are skipped.
    """
    kept = []
    for piece in entries:
        if piece.strip():
            kept.append(piece)
    return re.compile('|'.join(kept))
|
|
|
|
|
|
def normalize_slice(length, start, stop, step=None):
    """
    Convert slice components into concrete, in-range (start, stop) bounds for
    a sequence of `length` items.

    A `start` of None becomes 0 and a `stop` of None becomes `length`.
    Negative values are offset by `length`. Afterwards `start` is clamped to
    [0, length] and `stop` to [start, length], so the returned pair always
    satisfies 0 <= start <= stop <= length.

    Raises ValueError if `step` is anything other than None or 1.
    """
    if not (step is None or step == 1):
        # The two literals are concatenated; the space after the full stop
        # keeps the sentences apart in the resulting message.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))

    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamping against `start` (not 0) makes an inverted range collapse to
    # an empty one.
    stop = min(length, max(start, stop))

    assert 0 <= start <= stop <= length
    return start, stop
|
|
|
|
|
|
def check_renamed_kwargs(renamed, kwargs):
    """
    Raise TypeError if `kwargs` contains a keyword that has been renamed.

    `renamed` maps old keyword names to their replacements. The first old
    name (in the mapping's iteration order) that is present in `kwargs`
    triggers the error; if none is present the function returns None.
    """
    for old_name in renamed:
        if old_name not in kwargs:
            continue
        new_name = renamed[old_name]
        raise TypeError("Keyword argument %s now renamed to %s" % (old_name, new_name))
|
|
|
|
|
|
def read_json(location):
    """
    Open `location` for reading as UTF-8 text and return the result of
    parsing its contents with ujson.
    """
    with location.open('r', encoding='utf8') as json_file:
        parsed = ujson.load(json_file)
    return parsed
|
|
|
|
|
|
def parse_package_meta(package_path, package, require=True):
    """
    Load `package_path / package / 'meta.json'` and return its parsed
    contents. If that file does not exist, raise IOError when `require` is
    true and return None otherwise.
    """
    # TODO: Allow passing in full model path and only require one argument
    # instead of path and package name. This lets us avoid passing in an awkward
    # empty string in spacy.load() if user supplies full model path.
    meta_file = package_path / package / 'meta.json'
    if meta_file.is_file():
        return read_json(meta_file)
    if require:
        raise IOError("Could not read meta.json from %s" % meta_file)
    return None
|
|
|
|
|
|
def get_raw_input(description, default=False):
    """
    Prompt the user through input_() and return what was entered.

    The prompt shows `description`, followed by the `default` value in
    parentheses when `default` is truthy. The default is only displayed; the
    returned value is always the raw user input.
    """
    default_hint = ''
    if default:
        default_hint = ' (default: {d})'.format(d=default)
    return input_(' {d}{a}: '.format(d=description, a=default_hint))
|
|
|
|
|
|
def print_table(data, **kwargs):
    """
    Print data in table format. Can either take a sequence of row tuples or a
    dictionary (including dict subclasses), which is converted to a list of
    (key, value) tuples.

    Every cell is left-aligned in a 15-character column. Rows may have any
    number of cells, and empty data prints an empty table instead of raising.
    If the keyword argument 'title' is given and truthy, it is printed above
    the table, highlighted with ANSI escape sequences.
    """
    # isinstance also catches dict subclasses such as OrderedDict, which
    # cannot be indexed by position like a list of rows.
    if isinstance(data, dict):
        rows = list(data.items())
    else:
        rows = list(data)

    tpl_msg = '\n{msg}\n'
    tpl_title = '\n \033[93m{msg}\033[0m'
    tpl_cell = " {:<15}"
    # The row template is sized per row, so the column count follows the data
    # rather than being fixed at two.
    table = '\n'.join([(tpl_cell * len(row)).format(*row) for row in rows])

    if kwargs.get('title'):
        print(tpl_title.format(msg=kwargs['title']))

    print(tpl_msg.format(msg=table))
|
|
|
|
|
|
def print_markdown(data, **kwargs):
    """
    Print listed data in GitHub-flavoured Markdown format so it can be
    copy-pasted into issues. Can either take a list of (label, value) tuples
    or a dictionary (including dict subclasses), which is converted to a list
    of tuples.

    Rows whose value, rendered as text, contains the absolute path of this
    module's directory are left out. If the keyword argument 'title' is given
    and truthy, it is printed as a level-two heading above the list.
    """
    # Computed once rather than for every row.
    package_dir = unicode_(Path(__file__).parent)

    def excl_value(value):
        # don't print value if it contains absolute path of directory (i.e.
        # personal info). Other conditions can be included here if necessary.
        # The value is converted to text first so that non-string values
        # (numbers, booleans, None, paths) do not raise on the `in` test.
        return package_dir in unicode_(value)

    if isinstance(data, dict):
        data = list(data.items())

    tpl_msg = "\n{msg}\n"
    tpl_title = "\n## {msg}"
    tpl_row = "* **{l}:** {v}"
    markdown = '\n'.join([tpl_row.format(l=l, v=v)
                          for l, v in data if not excl_value(v)])

    if kwargs.get('title'):
        print(tpl_title.format(msg=kwargs['title']))
    print(tpl_msg.format(msg=markdown))
|
|
|
|
|
|
def print_msg(*text, **kwargs):
    """
    Print a formatted message. Every positional argument is wrapped with
    _wrap_text() and the resulting paragraphs are separated by blank lines.
    If the keyword argument 'title' is given and truthy, it is wrapped too
    and printed above the message, highlighted with ANSI escape sequences
    (written out by hand to avoid an extra dependency).
    """
    # Wrap the body first, so a failure there happens before anything is
    # printed.
    paragraphs = [_wrap_text(paragraph) for paragraph in text]
    body = '\n\n'.join(paragraphs)

    heading = kwargs.get('title')
    if heading:
        wrapped_heading = _wrap_text(heading)
        print('\n\033[93m{msg}\033[0m'.format(msg=wrapped_heading))
    print('\n{msg}\n'.format(msg=body))
|
|
|
|
|
|
def _wrap_text(text):
|
|
"""
|
|
Wrap text at given width using textwrap module. Indent should consist of
|
|
spaces. Its length is deducted from wrap width to ensure exact wrapping.
|
|
"""
|
|
wrap_max = 80
|
|
indent = ' '
|
|
wrap_width = wrap_max - len(indent)
|
|
return textwrap.fill(text, width=wrap_width, initial_indent=indent,
|
|
subsequent_indent=indent, break_long_words=False,
|
|
break_on_hyphens=False)
|
|
|
|
|
|
def sys_exit(*messages, **kwargs):
    """
    Exit the interpreter via sys.exit(0). Intended for modules used from the
    command line, like download and link. If any positional messages are
    given, they are first printed with print_msg(), which receives the same
    positional and keyword arguments.
    """
    if len(messages) > 0:
        print_msg(*messages, **kwargs)
    exit_code = 0
    sys.exit(exit_code)
|