mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 01:04:34 +03:00
Add compat functions and remove old workarounds
Add ensure_path util function to handle checking instance of path
This commit is contained in:
parent
26445ee304
commit
c05ec4b89a
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
||||||
import pip
|
import pip
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import importlib
|
import importlib
|
||||||
|
from ..compat import unicode_, symlink_to
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
|
|
||||||
|
@ -43,23 +44,17 @@ def symlink(model_path, link_name, force):
|
||||||
elif link_path.exists():
|
elif link_path.exists():
|
||||||
link_path.unlink()
|
link_path.unlink()
|
||||||
|
|
||||||
# Add workaround for Python 2 on Windows (see issue #909)
|
|
||||||
if util.is_python2() and util.is_windows():
|
|
||||||
import subprocess
|
|
||||||
command = ['mklink', '/d', unicode(link_path), unicode(model_path)]
|
|
||||||
try:
|
try:
|
||||||
subprocess.call(command, shell=True)
|
symlink_to(link_path, model_path)
|
||||||
except:
|
except:
|
||||||
# This is quite dirty, but just making sure other Windows-specific
|
# This is quite dirty, but just making sure other errors are caught so
|
||||||
# errors are caught so users at least see a proper error message.
|
# users at least see a proper message.
|
||||||
util.sys_exit(
|
util.sys_exit(
|
||||||
"Creating a symlink in spacy/data failed. You can still import "
|
"Creating a symlink in spacy/data failed. You can still import "
|
||||||
"the model as a Python package and call its load() method, or "
|
"the model as a Python package and call its load() method, or "
|
||||||
"create the symlink manually:",
|
"create the symlink manually:",
|
||||||
"{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)),
|
"{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
|
||||||
title="Error: Couldn't link model to '{l}'".format(l=link_name))
|
title="Error: Couldn't link model to '{l}'".format(l=link_name))
|
||||||
else:
|
|
||||||
link_path.symlink_to(model_path)
|
|
||||||
|
|
||||||
util.print_msg(
|
util.print_msg(
|
||||||
"{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
|
"{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
|
||||||
|
|
|
@ -1,20 +1,13 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import json
|
|
||||||
import shutil
|
import shutil
|
||||||
import requests
|
import requests
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import six
|
from ..compat import unicode_, json_dumps
|
||||||
|
|
||||||
from .. import about
|
|
||||||
from .. import util
|
from .. import util
|
||||||
|
|
||||||
if six.PY2:
|
|
||||||
json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8")
|
|
||||||
elif six.PY3:
|
|
||||||
json_dumps = lambda data: json.dumps(data, indent=2)
|
|
||||||
|
|
||||||
def package(input_dir, output_dir, force):
|
def package(input_dir, output_dir, force):
|
||||||
input_path = Path(input_dir)
|
input_path = Path(input_dir)
|
||||||
|
@ -32,31 +25,31 @@ def package(input_dir, output_dir, force):
|
||||||
package_path = main_path / model_name
|
package_path = main_path / model_name
|
||||||
|
|
||||||
create_dirs(package_path, force)
|
create_dirs(package_path, force)
|
||||||
shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
|
shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v))
|
||||||
create_file(main_path / 'meta.json', json_dumps(meta))
|
create_file(main_path / 'meta.json', json_dumps(meta))
|
||||||
create_file(main_path / 'setup.py', template_setup)
|
create_file(main_path / 'setup.py', template_setup)
|
||||||
create_file(main_path / 'MANIFEST.in', template_manifest)
|
create_file(main_path / 'MANIFEST.in', template_manifest)
|
||||||
create_file(package_path / '__init__.py', template_init)
|
create_file(package_path / '__init__.py', template_init)
|
||||||
|
|
||||||
util.print_msg(
|
util.print_msg(
|
||||||
main_path.as_posix(),
|
unicode_(main_path),
|
||||||
"To build the package, run `python setup.py sdist` in that directory.",
|
"To build the package, run `python setup.py sdist` in that directory.",
|
||||||
title="Successfully created package {p}".format(p=model_name_v))
|
title="Successfully created package {p}".format(p=model_name_v))
|
||||||
|
|
||||||
|
|
||||||
def check_dirs(input_path, output_path):
|
def check_dirs(input_path, output_path):
|
||||||
if not input_path.exists():
|
if not input_path.exists():
|
||||||
util.sys_exit(input_path.as_poisx(), title="Model directory not found")
|
util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
|
||||||
if not output_path.exists():
|
if not output_path.exists():
|
||||||
util.sys_exit(output_path.as_posix(), title="Output directory not found")
|
util.sys_exit(unicode_(output_path), title="Output directory not found")
|
||||||
|
|
||||||
|
|
||||||
def create_dirs(package_path, force):
|
def create_dirs(package_path, force):
|
||||||
if package_path.exists():
|
if package_path.exists():
|
||||||
if force:
|
if force:
|
||||||
shutil.rmtree(package_path.as_posix())
|
shutil.rmtree(unicode_(package_path.as_posix))
|
||||||
else:
|
else:
|
||||||
util.sys_exit(package_path.as_posix(),
|
util.sys_exit(unicode_(package_path.as_posix),
|
||||||
"Please delete the directory and try again.",
|
"Please delete the directory and try again.",
|
||||||
title="Package directory already exists")
|
title="Package directory already exists")
|
||||||
Path.mkdir(package_path, parents=True)
|
Path.mkdir(package_path, parents=True)
|
||||||
|
|
|
@ -6,12 +6,6 @@ from .cli import download
|
||||||
from .cli import link
|
from .cli import link
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
basestring
|
|
||||||
except NameError:
|
|
||||||
basestring = str
|
|
||||||
|
|
||||||
|
|
||||||
def read_lang_data(package):
|
def read_lang_data(package):
|
||||||
tokenization = package.load_json(('tokenizer', 'specials.json'))
|
tokenization = package.load_json(('tokenizer', 'specials.json'))
|
||||||
with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
|
with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
|
||||||
|
@ -73,9 +67,7 @@ def fix_glove_vectors_loading(overrides):
|
||||||
if overrides.get('path') in (None, True):
|
if overrides.get('path') in (None, True):
|
||||||
data_path = util.get_data_path()
|
data_path = util.get_data_path()
|
||||||
else:
|
else:
|
||||||
path = overrides['path']
|
path = util.ensure_path(overrides['path'])
|
||||||
if isinstance(path, basestring):
|
|
||||||
path = Path(path)
|
|
||||||
data_path = path.parent
|
data_path = path.parent
|
||||||
vec_path = None
|
vec_path = None
|
||||||
if 'add_vectors' not in overrides:
|
if 'add_vectors' not in overrides:
|
||||||
|
|
|
@ -4,17 +4,6 @@ from contextlib import contextmanager
|
||||||
import shutil
|
import shutil
|
||||||
import ujson
|
import ujson
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
basestring
|
|
||||||
except NameError:
|
|
||||||
basestring = str
|
|
||||||
|
|
||||||
try:
|
|
||||||
unicode
|
|
||||||
except NameError:
|
|
||||||
unicode = str
|
|
||||||
|
|
||||||
from .tokenizer import Tokenizer
|
from .tokenizer import Tokenizer
|
||||||
from .vocab import Vocab
|
from .vocab import Vocab
|
||||||
from .tagger import Tagger
|
from .tagger import Tagger
|
||||||
|
@ -26,6 +15,7 @@ from .syntax.nonproj import PseudoProjectivity
|
||||||
from .pipeline import DependencyParser, EntityRecognizer
|
from .pipeline import DependencyParser, EntityRecognizer
|
||||||
from .syntax.arc_eager import ArcEager
|
from .syntax.arc_eager import ArcEager
|
||||||
from .syntax.ner import BiluoPushDown
|
from .syntax.ner import BiluoPushDown
|
||||||
|
from .compat import unicode_
|
||||||
from .attrs import IS_STOP
|
from .attrs import IS_STOP
|
||||||
from . import attrs
|
from . import attrs
|
||||||
from . import orth
|
from . import orth
|
||||||
|
@ -205,7 +195,7 @@ class Language(object):
|
||||||
directory.mkdir()
|
directory.mkdir()
|
||||||
with (directory / 'config.json').open('wb') as file_:
|
with (directory / 'config.json').open('wb') as file_:
|
||||||
data = ujson.dumps(config, indent=2)
|
data = ujson.dumps(config, indent=2)
|
||||||
if isinstance(data, unicode):
|
if isinstance(data, unicode_):
|
||||||
data = data.encode('utf8')
|
data = data.encode('utf8')
|
||||||
file_.write(data)
|
file_.write(data)
|
||||||
if not (path / 'vocab').exists():
|
if not (path / 'vocab').exists():
|
||||||
|
@ -252,9 +242,7 @@ class Language(object):
|
||||||
def __init__(self, **overrides):
|
def __init__(self, **overrides):
|
||||||
if 'data_dir' in overrides and 'path' not in overrides:
|
if 'data_dir' in overrides and 'path' not in overrides:
|
||||||
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
|
raise ValueError("The argument 'data_dir' has been renamed to 'path'")
|
||||||
path = overrides.get('path', True)
|
path = util.ensure_path(overrides.get('path', True))
|
||||||
if isinstance(path, basestring):
|
|
||||||
path = pathlib.Path(path)
|
|
||||||
if path is True:
|
if path is True:
|
||||||
path = util.get_data_path() / self.lang
|
path = util.get_data_path() / self.lang
|
||||||
if not path.exists() and 'path' not in overrides:
|
if not path.exists() and 'path' not in overrides:
|
||||||
|
|
|
@ -15,6 +15,7 @@ from .tokens.doc cimport Doc
|
||||||
from .attrs cimport TAG
|
from .attrs cimport TAG
|
||||||
from .gold cimport GoldParse
|
from .gold cimport GoldParse
|
||||||
from .attrs cimport *
|
from .attrs cimport *
|
||||||
|
from . import util
|
||||||
|
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
|
@ -127,7 +128,7 @@ cdef class Tagger:
|
||||||
"""
|
"""
|
||||||
# TODO: Change this to expect config.json when we don't have to
|
# TODO: Change this to expect config.json when we don't have to
|
||||||
# support old data.
|
# support old data.
|
||||||
path = path if not isinstance(path, basestring) else pathlib.Path(path)
|
path = util.ensure_path(path)
|
||||||
if (path / 'templates.json').exists():
|
if (path / 'templates.json').exists():
|
||||||
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
with (path / 'templates.json').open('r', encoding='utf8') as file_:
|
||||||
templates = json.load(file_)
|
templates = json.load(file_)
|
||||||
|
|
|
@ -48,10 +48,8 @@ cdef class Tokenizer:
|
||||||
infix_finditer:
|
infix_finditer:
|
||||||
Signature of re.compile(string).finditer
|
Signature of re.compile(string).finditer
|
||||||
Returns Tokenizer
|
Returns Tokenizer
|
||||||
if isinstance(path, basestring):
|
|
||||||
path = pathlib.Path(path)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
path = util.ensure_path(path)
|
||||||
if rules is None:
|
if rules is None:
|
||||||
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
|
||||||
rules = json.load(file_)
|
rules = json.load(file_)
|
||||||
|
|
|
@ -8,17 +8,7 @@ from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
import textwrap
|
import textwrap
|
||||||
|
|
||||||
|
from .compat import basestring_, unicode_, input_
|
||||||
try:
|
|
||||||
basestring
|
|
||||||
except NameError:
|
|
||||||
basestring = str
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
raw_input
|
|
||||||
except NameError: # Python 3
|
|
||||||
raw_input = input
|
|
||||||
|
|
||||||
|
|
||||||
LANGUAGES = {}
|
LANGUAGES = {}
|
||||||
|
@ -46,9 +36,14 @@ def get_data_path(require_exists=True):
|
||||||
|
|
||||||
def set_data_path(path):
|
def set_data_path(path):
|
||||||
global _data_path
|
global _data_path
|
||||||
if isinstance(path, basestring):
|
_data_path = ensure_path(path)
|
||||||
path = pathlib.Path(path)
|
|
||||||
_data_path = path
|
|
||||||
|
def ensure_path(path):
|
||||||
|
if isinstance(path, basestring_):
|
||||||
|
return Path(path)
|
||||||
|
else:
|
||||||
|
return path
|
||||||
|
|
||||||
|
|
||||||
def or_(val1, val2):
|
def or_(val1, val2):
|
||||||
|
@ -94,7 +89,7 @@ def constraint_match(constraint_string, version):
|
||||||
|
|
||||||
|
|
||||||
def read_regex(path):
|
def read_regex(path):
|
||||||
path = path if not isinstance(path, basestring) else pathlib.Path(path)
|
path = ensure_path(path)
|
||||||
with path.open() as file_:
|
with path.open() as file_:
|
||||||
entries = file_.read().split('\n')
|
entries = file_.read().split('\n')
|
||||||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||||||
|
@ -151,16 +146,6 @@ def check_renamed_kwargs(renamed, kwargs):
|
||||||
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
|
||||||
|
|
||||||
|
|
||||||
def is_windows():
|
|
||||||
"""Check if user is on Windows."""
|
|
||||||
return sys.platform.startswith('win')
|
|
||||||
|
|
||||||
|
|
||||||
def is_python2():
|
|
||||||
"""Check if Python 2 is used."""
|
|
||||||
return sys.version.startswith('2.')
|
|
||||||
|
|
||||||
|
|
||||||
def parse_package_meta(package_path, package, require=True):
|
def parse_package_meta(package_path, package, require=True):
|
||||||
location = package_path / package / 'meta.json'
|
location = package_path / package / 'meta.json'
|
||||||
if location.is_file():
|
if location.is_file():
|
||||||
|
@ -180,7 +165,7 @@ def get_raw_input(description, default=False):
|
||||||
|
|
||||||
additional = ' (default: {d})'.format(d=default) if default else ''
|
additional = ' (default: {d})'.format(d=default) if default else ''
|
||||||
prompt = ' {d}{a}: '.format(d=description, a=additional)
|
prompt = ' {d}{a}: '.format(d=description, a=additional)
|
||||||
user_input = raw_input(prompt)
|
user_input = input_(prompt)
|
||||||
return user_input
|
return user_input
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,11 +5,6 @@ import bz2
|
||||||
import ujson as json
|
import ujson as json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
try:
|
|
||||||
import cPickle as pickle
|
|
||||||
except ImportError:
|
|
||||||
import pickle
|
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
from libc.stdint cimport int32_t
|
from libc.stdint cimport int32_t
|
||||||
from libc.math cimport sqrt
|
from libc.math cimport sqrt
|
||||||
|
@ -23,10 +18,7 @@ from .tokens.token cimport Token
|
||||||
from .serialize.packer cimport Packer
|
from .serialize.packer cimport Packer
|
||||||
from .attrs cimport PROB, LANG
|
from .attrs cimport PROB, LANG
|
||||||
|
|
||||||
try:
|
from .compat import copy_reg, pickle
|
||||||
import copy_reg
|
|
||||||
except ImportError:
|
|
||||||
import copyreg as copy_reg
|
|
||||||
from .lemmatizer import Lemmatizer
|
from .lemmatizer import Lemmatizer
|
||||||
from .attrs import intify_attrs
|
from .attrs import intify_attrs
|
||||||
from . import util
|
from . import util
|
||||||
|
@ -69,8 +61,7 @@ cdef class Vocab:
|
||||||
Returns:
|
Returns:
|
||||||
Vocab: The newly constructed vocab object.
|
Vocab: The newly constructed vocab object.
|
||||||
"""
|
"""
|
||||||
if isinstance(path, basestring):
|
path = util.ensure_path(path)
|
||||||
path = Path(path)
|
|
||||||
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
|
||||||
if 'vectors' in deprecated_kwargs:
|
if 'vectors' in deprecated_kwargs:
|
||||||
raise AttributeError(
|
raise AttributeError(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user