Tidy up deprecated.py

2025-11-08 11:57:39 +03:00 · 2017-05-07 23:29:22 +02:00 · 2017-05-07 23:29:22 +02:00 · b5a726c5cd
commit b5a726c5cd
parent 59c3b9d4dd
1 changed file with 19 additions and 68 deletions
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@@ -7,74 +7,7 @@ from . import about
 from . import util
 from .util import prints
 from .compat import path2str
-from .cli import download
-from .cli import link
-
-
-def read_lang_data(package):
-    tokenization = package.load_json(('tokenizer', 'specials.json'))
-    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
-        prefix = read_prefix(file_) if file_ is not None else None
-    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
-        suffix = read_suffix(file_) if file_ is not None else None
-    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
-        infix = read_infix(file_) if file_ is not None else None
-    return tokenization, prefix, suffix, infix
-
-
-def align_tokens(ref, indices): # Deprecated, surely?
-    start = 0
-    queue = list(indices)
-    for token in ref:
-        end = start + len(token)
-        emit = []
-        while queue and queue[0][1] <= end:
-            emit.append(queue.pop(0))
-        yield token, emit
-        start = end
-    assert not queue
-
-
-def detokenize(token_rules, words): # Deprecated?
-    """
-    To align with treebanks, return a list of "chunks", where a chunk is a
-    sequence of tokens that are separated by whitespace in actual strings. Each
-    chunk should be a tuple of token indices, e.g.
-
-    >>> detokenize(["ca<SEP>n't", '<SEP>!'], ["I", "ca", "n't", "!"])
-    [(0,), (1, 2, 3)]
-    """
-    string = ' '.join(words)
-    for subtoks in token_rules:
-        # Algorithmically this is dumb, but writing a little list-based match
-        # machine? Ain't nobody got time for that.
-        string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)
-    positions = []
-    i = 0
-    for chunk in string.split():
-        subtoks = chunk.split('<SEP>')
-        positions.append(tuple(range(i, i+len(subtoks))))
-        i += len(subtoks)
-    return positions
-
-
-def match_best_version(target_name, target_version, path):
-    path = util.ensure_path(path)
-    if path is None or not path.exists():
-        return None
-    matches = []
-    for data_name in path.iterdir():
-        name, version = split_data_name(data_name.parts[-1])
-        if name == target_name:
-            matches.append((tuple(float(v) for v in version.split('.')), data_name))
-    if matches:
-        return Path(max(matches)[1])
-    else:
-        return None
-
-
-def split_data_name(name):
-    return name.split('-', 1) if '-' in name else (name, '')
+from .cli import download, link


 def fix_glove_vectors_loading(overrides):
@@ -106,6 +39,24 @@ def fix_glove_vectors_loading(overrides):
    return overrides


+def match_best_version(target_name, target_version, path):
+    def split_data_name(name):
+        return name.split('-', 1) if '-' in name else (name, '')
+
+    path = util.ensure_path(path)
+    if path is None or not path.exists():
+        return None
+    matches = []
+    for data_name in path.iterdir():
+        name, version = split_data_name(data_name.parts[-1])
+        if name == target_name:
+            matches.append((tuple(float(v) for v in version.split('.')), data_name))
+    if matches:
+        return Path(max(matches)[1])
+    else:
+        return None
+
+
 def resolve_model_name(name):
    """
    If spaCy is loaded with 'de', check if symlink already exists. If