# spaCy/spacy/util.py
import os
import io
import json
import re
import os.path
import pathlib

import six

from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

# Python 2/3 compatibility: Python 3 has no `basestring`, so alias it to
# `str` there; on Python 2 the builtin (covering str and unicode) is kept.
try:
    basestring
except NameError:
    basestring = str
2016-03-25 20:54:45 +03:00
LANGUAGES = {}
2016-09-24 21:26:17 +03:00
_data_path = pathlib.Path(__file__).parent / 'data'
2016-03-25 20:54:45 +03:00
2016-03-26 13:44:53 +03:00
def set_lang_class(name, cls):
    """Register *cls* under *name* in the module-level LANGUAGES registry.

    Later registrations for the same name overwrite earlier ones.
    """
    global LANGUAGES
    LANGUAGES[name] = cls
def get_lang_class(name):
    """Look up a registered language class for *name*.

    Only the leading run of ASCII alphanumerics is used as the language key
    (everything from the first non-alphanumeric character on is ignored), so
    e.g. a name like 'en-1.1.0' resolves under 'en'.

    Raises RuntimeError when the language has not been registered via
    set_lang_class().
    """
    lang = re.split('[^a-zA-Z0-9]', name, 1)[0]
    if lang not in LANGUAGES:
        raise RuntimeError('Language not supported: %s' % lang)
    return LANGUAGES[lang]
def get_data_path():
    """Return the directory currently used to look up installed data."""
    return _data_path
def set_data_path(path):
    """Set the default data directory.

    *path* may be a string or a pathlib.Path; strings are converted.
    """
    global _data_path
    # pathlib.Path() accepts both strings and existing Path objects, so the
    # original isinstance(path, basestring) check was redundant.
    _data_path = pathlib.Path(path)
def match_best_version(target_name, target_version, path):
    """Return the newest data directory under *path* matching the request.

    Directory names are expected to look like '<name>-<version>'
    (e.g. 'en-1.1.0'); they are split with split_data_name() and the version
    is checked with constraint_match(). Among all matches, the one with the
    highest numeric version wins. Returns a pathlib.Path, or None when
    nothing matches.
    """
    # pathlib.Path() accepts strings and Path objects alike.
    path = pathlib.Path(path)
    matches = []
    for data_name in path.iterdir():
        name, version = split_data_name(data_name.parts[-1])
        if name == target_name and constraint_match(target_version, version):
            # Sort key: numeric version tuple, so '1.10.0' ranks above
            # '1.9.0'. A directory with no version part gets the lowest
            # possible key (the original crashed on float('') here).
            key = tuple(float(v) for v in version.split('.')) if version else ()
            matches.append((key, data_name))
    if matches:
        return pathlib.Path(max(matches)[1])
    return None
def split_data_name(name):
    """Split a data directory name like 'en-1.1.0' into ('en', '1.1.0').

    Splits at the first '-' only; a name without '-' gets an empty version
    string. Always returns a tuple (the original returned a list from one
    branch and a tuple from the other).
    """
    if '-' in name:
        head, _, tail = name.partition('-')
        return (head, tail)
    return (name, '')
def constraint_match(constraint_string, version):
    """Return True if *version* satisfies every comma-separated constraint
    in *constraint_string*, e.g. '>=1.0.0,<2.0.0'.

    An empty or falsy constraint string matches any version.
    Raises ValueError for a malformed constraint.
    """
    # Adapted from http://github.com/spacy-io/sputnik
    if not constraint_string:
        return True
    constraints = [c.strip() for c in constraint_string.split(',') if c.strip()]
    for c in constraints:
        if not re.match(r'[><=][=]?\d+(\.\d+)*', c):
            raise ValueError('invalid constraint: %s' % c)
    # Bug fix: the original called semver.match(), but `semver` is never
    # imported in this module, so any non-empty constraint raised NameError.
    # Compare dotted numeric versions directly instead.
    return all(_constraint_holds(version, c) for c in constraints)


def _constraint_holds(version, constraint):
    """Check a single constraint like '>=1.0' against a dotted *version*.

    Versions are compared as tuples of ints, so '1.10' > '1.9'. This is a
    simplification of full semver (no pre-release/build metadata), matching
    the purely numeric versions this module produces.
    """
    op_match = re.match(r'([><=]=?)\s*(\d+(?:\.\d+)*)', constraint)
    op, required = op_match.groups()
    have = tuple(int(p) for p in version.split('.'))
    want = tuple(int(p) for p in required.split('.'))
    if op in ('=', '=='):
        return have == want
    if op == '>':
        return have > want
    if op == '<':
        return have < want
    if op == '>=':
        return have >= want
    # Only '<=' remains: the validation regex admits no other operator.
    return have <= want
def read_regex(path):
    """Compile a regex matching any newline-separated entry in the file at
    *path*, anchored at the start of the string.

    Entries are re.escape'd, so they are treated as literal strings, not
    regex syntax. Blank lines are skipped.
    """
    # pathlib.Path() accepts strings and Path objects alike.
    path = pathlib.Path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join('^' + re.escape(piece) for piece in entries if piece.strip())
    return re.compile(expression)
def read_prefix_regex(path):
    """Compile a regex matching any entry in the file at *path* as a prefix
    (anchored with '^'); entries are one per line, re.escape'd, blank lines
    skipped.

    NOTE(review): this is currently byte-identical in behavior to
    read_regex(); kept as a separate name for symmetry with the suffix and
    infix readers.
    """
    # pathlib.Path() accepts strings and Path objects alike.
    path = pathlib.Path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join('^' + re.escape(piece) for piece in entries if piece.strip())
    return re.compile(expression)
def read_suffix_regex(path):
    """Compile a regex matching any entry in the file at *path* as a suffix
    (anchored with '$'); entries are one per line, blank lines skipped.

    NOTE(review): unlike read_prefix_regex(), entries here are NOT
    re.escape'd — the data file appears to be allowed to contain regex
    syntax. Confirm before adding escaping.
    """
    # pathlib.Path() accepts strings and Path objects alike.
    path = pathlib.Path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(piece + '$' for piece in entries if piece.strip())
    return re.compile(expression)
def read_infix_regex(path):
    """Compile a regex matching any entry in the file at *path* anywhere in
    the string (no anchoring); entries are one per line, blank lines skipped.

    NOTE(review): entries are NOT re.escape'd, consistent with
    read_suffix_regex() — the data file may contain regex syntax.
    """
    # pathlib.Path() accepts strings and Path objects alike.
    path = pathlib.Path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(piece for piece in entries if piece.strip())
    return re.compile(expression)
def normalize_slice(length, start, stop, step=None):
    """Clamp slice endpoints for a sequence of *length* items.

    Handles None endpoints (defaulting to the full range) and negative
    indices (offset from the end), then clamps so that
    0 <= start <= stop <= length. Returns the (start, stop) pair.

    Raises ValueError for any step other than None or 1.
    """
    if not (step is None or step == 1):
        # Bug fix: the original's adjacent string literals concatenated to
        # "...objects.Try:" with no separating space.
        raise ValueError("Stepped slices not supported in Span objects. "
                         "Try: list(tokens)[start:stop:step] instead.")
    if start is None:
        start = 0
    elif start < 0:
        start += length
    start = min(length, max(0, start))
    if stop is None:
        stop = length
    elif stop < 0:
        stop += length
    # Clamp stop to at least start so the result is always a valid range.
    stop = min(length, max(start, stop))
    assert 0 <= start <= stop <= length
    return start, stop
def utf8open(loc, mode='r'):
    """Open the file at *loc* in *mode* with UTF-8 encoding.

    io.open is used (rather than the builtin) for Python 2 compatibility,
    consistent with the `six`/`basestring` shims elsewhere in this module.
    """
    return io.open(loc, mode, encoding='utf8')