Add compat functions and remove old workarounds

Add ensure_path util function that converts string paths to Path objects and returns any other value unchanged
2025-11-04 01:48:04 +03:00 · 2017-04-15 12:11:16 +02:00 · 2017-04-15 12:11:16 +02:00 · c05ec4b89a
commit c05ec4b89a
parent 26445ee304
8 changed files with 39 additions and 96 deletions
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import pip
 from pathlib import Path
 import importlib
 from ..compat import unicode_, symlink_to
 from .. import util
@ -43,23 +44,17 @@ def symlink(model_path, link_name, force):
    elif link_path.exists():
        link_path.unlink()
    # Add workaround for Python 2 on Windows (see issue #909)
    if util.is_python2() and util.is_windows():
        import subprocess
        command = ['mklink', '/d', unicode(link_path), unicode(model_path)]
    try:
-            subprocess.call(command, shell=True)
+        symlink_to(link_path, model_path)
    except:
-            # This is quite dirty, but just making sure other Windows-specific
+        # This is quite dirty, but just making sure other errors are caught so
-            # errors are caught so users at least see a proper error message.
+        # users at least see a proper message.
        util.sys_exit(
            "Creating a symlink in spacy/data failed. You can still import "
            "the model as a Python package and call its load() method, or "
            "create the symlink manually:",
-                "{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)),
+            "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)),
            title="Error: Couldn't link model to '{l}'".format(l=link_name))
    else:
        link_path.symlink_to(model_path)
    util.print_msg(
        "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()),
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -1,20 +1,13 @@
 # coding: utf8
 from __future__ import unicode_literals
 import json
 import shutil
 import requests
 from pathlib import Path
-import six
+from ..compat import unicode_, json_dumps
 from .. import about
 from .. import util
 if six.PY2:
    json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8")
 elif six.PY3:
    json_dumps = lambda data: json.dumps(data, indent=2)
 def package(input_dir, output_dir, force):
    input_path = Path(input_dir)
@ -32,31 +25,31 @@ def package(input_dir, output_dir, force):
    package_path = main_path / model_name
    create_dirs(package_path, force)
-    shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix())
+    shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v))
    create_file(main_path / 'meta.json', json_dumps(meta))
    create_file(main_path / 'setup.py', template_setup)
    create_file(main_path / 'MANIFEST.in', template_manifest)
    create_file(package_path / '__init__.py', template_init)
    util.print_msg(
-        main_path.as_posix(),
+        unicode_(main_path),
        "To build the package, run `python setup.py sdist` in that directory.",
        title="Successfully created package {p}".format(p=model_name_v))
 def check_dirs(input_path, output_path):
    if not input_path.exists():
-        util.sys_exit(input_path.as_poisx(), title="Model directory not found")
+        util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found")
    if not output_path.exists():
-        util.sys_exit(output_path.as_posix(), title="Output directory not found")
+        util.sys_exit(unicode_(output_path), title="Output directory not found")
 def create_dirs(package_path, force):
    if package_path.exists():
        if force:
-            shutil.rmtree(package_path.as_posix())
+            shutil.rmtree(unicode_(package_path.as_posix))
        else:
-            util.sys_exit(package_path.as_posix(),
+            util.sys_exit(unicode_(package_path.as_posix),
                "Please delete the directory and try again.",
                title="Package directory already exists")
    Path.mkdir(package_path, parents=True)
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@ -6,12 +6,6 @@ from .cli import download
 from .cli import link
 try:
    basestring
 except NameError:
    basestring = str
 def read_lang_data(package):
    tokenization = package.load_json(('tokenizer', 'specials.json'))
    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
@ -73,9 +67,7 @@ def fix_glove_vectors_loading(overrides):
    if overrides.get('path') in (None, True):
        data_path = util.get_data_path()
    else:
-        path = overrides['path']
+        path = util.ensure_path(overrides['path'])
        if isinstance(path, basestring):
            path = Path(path)
        data_path = path.parent
    vec_path = None
    if 'add_vectors' not in overrides:
--- a/spacy/language.py
+++ b/spacy/language.py
@ -4,17 +4,6 @@ from contextlib import contextmanager
 import shutil
 import ujson
 try:
    basestring
 except NameError:
    basestring = str
 try:
    unicode
 except NameError:
    unicode = str
 from .tokenizer import Tokenizer
 from .vocab import Vocab
 from .tagger import Tagger
@ -26,6 +15,7 @@ from .syntax.nonproj import PseudoProjectivity
 from .pipeline import DependencyParser, EntityRecognizer
 from .syntax.arc_eager import ArcEager
 from .syntax.ner import BiluoPushDown
 from .compat import unicode_
 from .attrs import IS_STOP
 from . import attrs
 from . import orth
@ -205,7 +195,7 @@ class Language(object):
            directory.mkdir()
            with (directory / 'config.json').open('wb') as file_:
                data = ujson.dumps(config, indent=2)
-                if isinstance(data, unicode):
+                if isinstance(data, unicode_):
                    data = data.encode('utf8')
                file_.write(data)
        if not (path / 'vocab').exists():
@ -252,9 +242,7 @@ class Language(object):
    def __init__(self, **overrides):
        if 'data_dir' in overrides and 'path' not in overrides:
            raise ValueError("The argument 'data_dir' has been renamed to 'path'")
-        path = overrides.get('path', True)
+        path = util.ensure_path(overrides.get('path', True))
        if isinstance(path, basestring):
            path = pathlib.Path(path)
        if path is True:
            path = util.get_data_path() / self.lang
            if not path.exists() and 'path' not in overrides:
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@ -15,6 +15,7 @@ from .tokens.doc cimport Doc
 from .attrs cimport TAG
 from .gold cimport GoldParse
 from .attrs cimport *
 from . import util
 cpdef enum:
@ -127,7 +128,7 @@ cdef class Tagger:
        """
        # TODO: Change this to expect config.json when we don't have to
        # support old data.
-        path = path if not isinstance(path, basestring) else pathlib.Path(path)
+        path = util.ensure_path(path)
        if (path / 'templates.json').exists():
            with (path / 'templates.json').open('r', encoding='utf8') as file_:
                templates = json.load(file_)
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -48,10 +48,8 @@ cdef class Tokenizer:
            infix_finditer:
                Signature of re.compile(string).finditer
        Returns Tokenizer
        if isinstance(path, basestring):
            path = pathlib.Path(path)
        """
        path = util.ensure_path(path)
        if rules is None:
            with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_:
                rules = json.load(file_)
--- a/spacy/util.py
+++ b/spacy/util.py
@ -8,17 +8,7 @@ from pathlib import Path
 import sys
 import textwrap
-
+from .compat import basestring_, unicode_, input_
 try:
    basestring
 except NameError:
    basestring = str
 try:
    raw_input
 except NameError: # Python 3
    raw_input = input
 LANGUAGES = {}
@ -46,9 +36,14 @@ def get_data_path(require_exists=True):
 def set_data_path(path):
    global _data_path
-    if isinstance(path, basestring):
+    _data_path = ensure_path(path)
-        path = pathlib.Path(path)
+
-    _data_path = path
+
 def ensure_path(path):
    if isinstance(path, basestring_):
        return Path(path)
    else:
        return path
 def or_(val1, val2):
@ -94,7 +89,7 @@ def constraint_match(constraint_string, version):
 def read_regex(path):
-    path = path if not isinstance(path, basestring) else pathlib.Path(path)
+    path = ensure_path(path)
    with path.open() as file_:
        entries = file_.read().split('\n')
    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
@ -151,16 +146,6 @@ def check_renamed_kwargs(renamed, kwargs):
            raise TypeError("Keyword argument %s now renamed to %s" % (old, new))
 def is_windows():
    """Check if user is on Windows."""
    return sys.platform.startswith('win')
 def is_python2():
    """Check if Python 2 is used."""
    return sys.version.startswith('2.')
 def parse_package_meta(package_path, package, require=True):
    location = package_path / package / 'meta.json'
    if location.is_file():
@ -180,7 +165,7 @@ def get_raw_input(description, default=False):
    additional = ' (default: {d})'.format(d=default) if default else ''
    prompt = '    {d}{a}: '.format(d=description, a=additional)
-    user_input = raw_input(prompt)
+    user_input = input_(prompt)
    return user_input
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@ -5,11 +5,6 @@ import bz2
 import ujson as json
 import re
 try:
    import cPickle as pickle
 except ImportError:
    import pickle
 from libc.string cimport memset
 from libc.stdint cimport int32_t
 from libc.math cimport sqrt
@ -23,10 +18,7 @@ from .tokens.token cimport Token
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
-try:
+from .compat import copy_reg, pickle
    import copy_reg
 except ImportError:
    import copyreg as copy_reg
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
 from . import util
@ -69,8 +61,7 @@ cdef class Vocab:
        Returns:
            Vocab: The newly constructed vocab object.
        """
-        if isinstance(path, basestring):
+        path = util.ensure_path(path)
            path = Path(path)
        util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs)
        if 'vectors' in deprecated_kwargs:
            raise AttributeError(