diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index e3ce53024..000000000
--- a/.travis.yml
+++ /dev/null
@@ -1,23 +0,0 @@
-language: python
-sudo: false
-cache: pip
-dist: trusty
-group: edge
-python:
-  - "2.7"
-os:
-  - linux
-install:
-  - "pip install -r requirements.txt"
-  - "python setup.py build_ext --inplace"
-  - "pip install -e ."
-script:
-  - "cat /proc/cpuinfo | grep flags | head -n 1"
-  - "python -m pytest --tb=native spacy"
-branches:
-  except:
-    - spacy.io
-notifications:
-  slack:
-    secure: F8GvqnweSdzImuLL64TpfG0i5rYl89liyr9tmFVsHl4c0DNiDuGhZivUz0M1broS8svE3OPOllLfQbACG/4KxD890qfF9MoHzvRDlp7U+RtwMV/YAkYn8MGWjPIbRbX0HpGdY7O2Rc9Qy4Kk0T8ZgiqXYIqAz2Eva9/9BlSmsJQ=
-  email: false
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c2b56cd3..6b7881dd2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -280,23 +280,7 @@ except:  # noqa: E722

 ### Python conventions

-All Python code must be written in an **intersection of Python 2 and Python 3**.
-This is easy in Cython, but somewhat ugly in Python. Logic that deals with
-Python or platform compatibility should only live in
-[`spacy.compat`](spacy/compat.py). To distinguish them from the builtin
-functions, replacement functions are suffixed with an underscore, for example
-`unicode_`. If you need to access the user's version or platform information,
-for example to show more specific error messages, you can use the `is_config()`
-helper function.
-
-```python
-from .compat import unicode_, is_config
-
-compatible_unicode = unicode_('hello world')
-if is_config(windows=True, python2=True):
-    print("You are using Python 2 on Windows.")
-```
-
+All Python code must be written to be **compatible with Python 3.6+**.
 Code that interacts with the file-system should accept objects that follow the
 `pathlib.Path` API, without assuming that the object inherits from
 `pathlib.Path`. If the function is user-facing and takes a path as an argument,
 it should check
diff --git a/README.md b/README.md
index 74d2d2166..500431b9f 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,6 @@ It's commercial open-source software, released under the MIT license.
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

 [![Azure Pipelines]()](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
-[![Travis Build Status]()](https://travis-ci.org/explosion/spaCy)
 [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square&logo=github)](https://github.com/explosion/spaCy/releases)
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy/)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square&logo=conda-forge&logoColor=white)](https://anaconda.org/conda-forge/spacy)
@@ -98,7 +97,7 @@ For detailed installation instructions, see the

 - **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
   Studio)
-- **Python version**: Python 2.7, 3.5+ (only 64 bit)
+- **Python version**: Python 3.6+ (only 64 bit)
 - **Package managers**: [pip] · [conda] (via `conda-forge`)

 [pip]: https://pypi.org/project/spacy/
@@ -269,9 +268,7 @@ and git preinstalled.

 Install a version of the
 [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or
 [Visual Studio Express](https://visualstudio.microsoft.com/vs/express/) that
-matches the version that was used to compile your Python interpreter. For
-official distributions these are VS 2008 (Python 2.7), VS 2010 (Python 3.4) and
-VS 2015 (Python 3.5).
+matches the version that was used to compile your Python interpreter.

 ## Run tests
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 054365336..d34da39f7 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -35,12 +35,6 @@ jobs:
     dependsOn: 'Validate'
     strategy:
       matrix:
-        Python35Linux:
-          imageName: 'ubuntu-16.04'
-          python.version: '3.5'
-        Python35Windows:
-          imageName: 'vs2017-win2016'
-          python.version: '3.5'
         Python36Linux:
          imageName: 'ubuntu-16.04'
          python.version: '3.6'
diff --git a/bin/cythonize.py b/bin/cythonize.py
index 4814f8df0..554252294 100755
--- a/bin/cythonize.py
+++ b/bin/cythonize.py
@@ -38,14 +38,14 @@ import argparse
 HASH_FILE = "cythonize.json"


-def process_pyx(fromfile, tofile, language_level="-2"):
+def process_pyx(fromfile, tofile, language_level="-3"):
     print("Processing %s" % fromfile)
     try:
         from Cython.Compiler.Version import version as cython_version
         from distutils.version import LooseVersion

-        if LooseVersion(cython_version) < LooseVersion("0.19"):
-            raise Exception("Require Cython >= 0.19")
+        if LooseVersion(cython_version) < LooseVersion("0.25"):
+            raise Exception("Require Cython >= 0.25")

     except ImportError:
         pass
diff --git a/fabfile.py b/fabfile.py
index fcab493f5..460471747 100644
--- a/fabfile.py
+++ b/fabfile.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals, print_function
-
 import contextlib
 from pathlib import Path
 from fabric.api import local, lcd, env, settings, prefix
diff --git a/requirements.txt b/requirements.txt
index 1786ee186..188459c67 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,7 +11,6 @@ catalogue>=0.0.7,<1.1.0
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac>=0.9.6,<1.2.0
-pathlib==1.0.1; python_version < "3.4"
 tqdm>=4.38.0,<5.0.0
 # Optional dependencies
 jsonschema>=2.6.0,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index a0103c5a2..28259c989 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -16,10 +16,7 @@ classifiers =
     Operating System :: MacOS :: MacOS X
     Operating System :: Microsoft :: Windows
     Programming Language :: Cython
-    Programming Language :: Python :: 2
-    Programming Language :: Python :: 2.7
     Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.5
     Programming Language :: Python :: 3.6
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
@@ -30,7 +27,7 @@ zip_safe = false
 include_package_data = true
 scripts =
     bin/spacy
-python_requires = >=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*
+python_requires = >=3.6
 setup_requires =
     wheel
     cython>=0.25
@@ -54,7 +51,6 @@ install_requires =
     numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
-    pathlib==1.0.1; python_version < "3.4"

 [options.extras_require]
 lookups =
diff --git a/setup.py b/setup.py
index 62a09aa73..1afdc7ae4 100755
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-from __future__ import print_function
 import io
 import os
 import subprocess
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 4a0d16a49..49db0e3b5 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,5 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals

 import warnings
 import sys
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 2c285095e..06ba5704d 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -1,9 +1,3 @@
-# coding: utf8
-from __future__ import print_function
-
-# NB! This breaks in plac on Python 2!!
-# from __future__ import unicode_literals
-
 if __name__ == "__main__":
     import plac
     import sys
@@ -32,5 +26,5 @@ if __name__ == "__main__":
         if command in commands:
             plac.call(commands[command], sys.argv[1:])
         else:
-            available = "Available: {}".format(", ".join(commands))
-            msg.fail("Unknown command: {}".format(command), available, exits=1)
+            available = f"Available: {', '.join(commands)}"
+            msg.fail(f"Unknown command: {command}", available, exits=1)
diff --git a/spacy/_ml.py b/spacy/_ml.py
index 8695a88cc..a1d2b6b77 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import numpy
 from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
 from thinc.t2t import ExtractWindow, ParametricAttention
diff --git a/spacy/analysis.py b/spacy/analysis.py
index 761be3de9..ed6d6b18e 100644
--- a/spacy/analysis.py
+++ b/spacy/analysis.py
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-from collections import OrderedDict
 from wasabi import Printer

 from .tokens import Doc, Token, Span
@@ -23,7 +19,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
     assert pipeline[index][0] == name
     prev_pipes = pipeline[:index]
     pipe_requires = getattr(pipe, "requires", [])
-    requires = OrderedDict([(annot, False) for annot in pipe_requires])
+    requires = {annot: False for annot in pipe_requires}
     if requires:
         for prev_name, prev_pipe in prev_pipes:
             prev_assigns = getattr(prev_pipe, "assigns", [])
@@ -98,15 +94,15 @@ def validate_attrs(values):
         for ext_attr, ext_value in value.items():
             # We don't check whether the attribute actually exists
             if ext_value is not True:  # attr is something like doc._.x.y
-                good = "{}._.{}".format(obj_key, ext_attr)
-                bad = "{}.{}".format(good, ".".join(ext_value))
+                good = f"{obj_key}._.{ext_attr}"
+                bad = f"{good}.{'.'.join(ext_value)}"
                 raise ValueError(Errors.E183.format(attr=bad, solution=good))
             continue  # we can't validate those further
         if attr.endswith("_"):  # attr is something like "token.pos_"
             raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
         if value is not True:  # attr is something like doc.x.y
-            good = "{}.{}".format(obj_key, attr)
-            bad = "{}.{}".format(good, ".".join(value))
+            good = f"{obj_key}.{attr}"
+            bad = f"{good}.{'.'.join(value)}"
             raise ValueError(Errors.E183.format(attr=bad, solution=good))
         obj = objs[obj_key]
         if not hasattr(obj, attr):
@@ -168,11 +164,10 @@ def print_summary(nlp, pretty=True, no_print=False):
     msg.table(overview, header=header, divider=True, multiline=True)
     n_problems = sum(len(p) for p in problems.values())
     if any(p for p in problems.values()):
-        msg.divider("Problems ({})".format(n_problems))
+        msg.divider(f"Problems ({n_problems})")
         for name, problem in problems.items():
             if problem:
-                problem = ", ".join(problem)
-                msg.warn("'{}' requirements not met: {}".format(name, problem))
+                msg.warn(f"'{name}' requirements not met: {', '.join(problem)}")
     else:
         msg.good("No problems found.")
     if no_print:
diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx
index 6d1c18eb9..a601a7a66 100644
--- a/spacy/attrs.pyx
+++ b/spacy/attrs.pyx
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-

 IDS = {
     "": NULL_ATTR,
diff --git a/spacy/cli/_schemas.py b/spacy/cli/_schemas.py
index 3fb2c8979..42e5e04dd 100644
--- a/spacy/cli/_schemas.py
+++ b/spacy/cli/_schemas.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-

 # NB: This schema describes the new format of the training data, see #2928
 TRAINING_SCHEMA = {
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 0cc0693a8..d8c8a7a18 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 from pathlib import Path
 from wasabi import Printer
@@ -30,16 +27,18 @@ FILE_TYPES_STDOUT = ("json", "jsonl")


 @plac.annotations(
+    # fmt: off
     input_file=("Input file", "positional", None, str),
     output_dir=("Output directory. '-' for stdout.", "positional", None, str),
-    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
+    file_type=(f"Type of data to produce: {FILE_TYPES}", "option", "t", str, FILE_TYPES),
     n_sents=("Number of sentences per doc (0 to disable)", "option", "n", int),
     seg_sents=("Segment sentences (for -c ner)", "flag", "s"),
     model=("Model for sentence segmentation (for -s)", "option", "b", str),
-    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
+    converter=(f"Converter: {tuple(CONVERTERS.keys())}", "option", "c", str),
     lang=("Language (if tokenizer required)", "option", "l", str),
     morphology=("Enable appending morphology to tags", "flag", "m", bool),
-    ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path),
+    ner_map_path=("NER tag mapping (as JSON-encoded dict of entity types)", "option", "N", Path,),
+    # fmt: on
 )
 def convert(
     input_file,
@@ -62,16 +61,10 @@ def convert(
     no_print = output_dir == "-"
     msg = Printer(no_print=no_print)
     input_path = Path(input_file)
-    if file_type not in FILE_TYPES:
-        msg.fail(
-            "Unknown file type: '{}'".format(file_type),
-            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
-            exits=1,
-        )
     if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
         # TODO: support msgpack via stdout in srsly?
         msg.fail(
-            "Can't write .{} data to stdout.".format(file_type),
+            f"Can't write .{file_type} data to stdout",
             "Please specify an output directory.",
             exits=1,
         )
@@ -95,7 +88,7 @@ def convert(
             "Can't automatically detect NER format. Conversion may not succeed. See https://spacy.io/api/cli#convert"
         )
     if converter not in CONVERTERS:
-        msg.fail("Can't find converter for {}".format(converter), exits=1)
+        msg.fail(f"Can't find converter for {converter}", exits=1)
     ner_map = None
     if ner_map_path is not None:
         ner_map = srsly.read_json(ner_map_path)
@@ -113,7 +106,7 @@ def convert(
     )
     if output_dir != "-":
         # Export data to a file
-        suffix = ".{}".format(file_type)
+        suffix = f".{file_type}"
         output_file = Path(output_dir) / Path(input_path.parts[-1]).with_suffix(suffix)
         if file_type == "json":
             srsly.write_json(output_file, data)
@@ -121,9 +114,7 @@ def convert(
             srsly.write_jsonl(output_file, data)
         elif file_type == "msg":
             srsly.write_msgpack(output_file, data)
-        msg.good(
-            "Generated output file ({} documents): {}".format(len(data), output_file)
-        )
+        msg.good(f"Generated output file ({len(data)} documents): {output_file}")
     else:
         # Print to stdout
         if file_type == "json":
diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py
index 46489ad7c..b607d5913 100644
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from wasabi import Printer

 from ...gold import iob_to_biluo
@@ -64,9 +61,9 @@ def conll_ner2json(
     # sentence segmentation required for document segmentation
     if n_sents > 0 and not seg_sents:
         msg.warn(
-            "No sentence boundaries found to use with option `-n {}`. "
-            "Use `-s` to automatically segment sentences or `-n 0` "
-            "to disable.".format(n_sents)
+            f"No sentence boundaries found to use with option `-n {n_sents}`. "
+            f"Use `-s` to automatically segment sentences or `-n 0` "
+            f"to disable."
         )
     else:
         n_sents_info(msg, n_sents)
@@ -129,7 +126,7 @@ def segment_sents_and_docs(doc, n_sents, doc_delimiter, model=None, msg=None):
     if model:
         nlp = load_model(model)
         if "parser" in nlp.pipe_names:
-            msg.info("Segmenting sentences with parser from model '{}'.".format(model))
+            msg.info(f"Segmenting sentences with parser from model '{model}'.")
             sentencizer = nlp.get_pipe("parser")
     if not sentencizer:
         msg.info(
@@ -166,7 +163,7 @@ def segment_docs(input_data, n_sents, doc_delimiter):


 def n_sents_info(msg, n_sents):
-    msg.info("Grouping every {} sentences into a document.".format(n_sents))
+    msg.info(f"Grouping every {n_sents} sentences into a document.")
     if n_sents == 1:
         msg.warn(
             "To generate better training data, you may want to group "
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 3febd07d1..12b1103d4 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import re

 from spacy.gold import Example
diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py
index 61c398f8d..b6ac234fc 100644
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from wasabi import Printer

 from ...gold import iob_to_biluo
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
index 1c1bc45c7..525063b22 100644
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import srsly

 from ...gold import docs_to_json
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index c2af5bff0..2e780f53c 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
 from pathlib import Path
 from collections import Counter
 import plac
@@ -23,20 +20,17 @@ BLANK_MODEL_THRESHOLD = 2000


 @plac.annotations(
+    # fmt: off
     lang=("model language", "positional", None, str),
     train_path=("location of JSON-formatted training data", "positional", None, Path),
     dev_path=("location of JSON-formatted development data", "positional", None, Path),
     tag_map_path=("Location of JSON-formatted tag map", "option", "tm", Path),
     base_model=("name of model to update (optional)", "option", "b", str),
-    pipeline=(
-        "Comma-separated names of pipeline components to train",
-        "option",
-        "p",
-        str,
-    ),
+    pipeline=("Comma-separated names of pipeline components to train", "option", "p", str),
     ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
     verbose=("Print additional information and explanations", "flag", "V", bool),
     no_format=("Don't pretty-print the results", "flag", "NF", bool),
+    # fmt: on
 )
 def debug_data(
     lang,
@@ -93,15 +87,11 @@ def debug_data(
             corpus.train_dataset_without_preprocessing(nlp)
         )
     except ValueError as e:
-        loading_train_error_message = "Training data cannot be loaded: {}".format(
-            str(e)
-        )
+        loading_train_error_message = f"Training data cannot be loaded: {e}"
     try:
         dev_dataset = list(corpus.dev_dataset(nlp))
     except ValueError as e:
-        loading_dev_error_message = "Development data cannot be loaded: {}".format(
-            str(e)
-        )
+        loading_dev_error_message = f"Development data cannot be loaded: {e}"
     if loading_train_error_message or loading_dev_error_message:
         if loading_train_error_message:
             msg.fail(loading_train_error_message)
@@ -112,78 +102,66 @@ def debug_data(

     # Create all gold data here to avoid iterating over the train_dataset constantly
     gold_train_data = _compile_gold(train_dataset, pipeline)
-    gold_train_unpreprocessed_data = _compile_gold(train_dataset_unpreprocessed, pipeline)
+    gold_train_unpreprocessed_data = _compile_gold(
+        train_dataset_unpreprocessed, pipeline
+    )
     gold_dev_data = _compile_gold(dev_dataset, pipeline)

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]

     msg.divider("Training stats")
-    msg.text("Training pipeline: {}".format(", ".join(pipeline)))
+    msg.text(f"Training pipeline: {', '.join(pipeline)}")
     for pipe in [p for p in pipeline if p not in nlp.factories]:
-        msg.fail("Pipeline component '{}' not available in factories".format(pipe))
+        msg.fail(f"Pipeline component '{pipe}' not available in factories")
     if base_model:
-        msg.text("Starting with base model '{}'".format(base_model))
+        msg.text(f"Starting with base model '{base_model}'")
     else:
-        msg.text("Starting with blank model '{}'".format(lang))
-    msg.text("{} training docs".format(len(train_dataset)))
-    msg.text("{} evaluation docs".format(len(gold_dev_data)))
+        msg.text(f"Starting with blank model '{lang}'")
+    msg.text(f"{len(train_dataset)} training docs")
+    msg.text(f"{len(gold_dev_data)} evaluation docs")

     if not len(gold_dev_data):
         msg.fail("No evaluation docs")
     overlap = len(train_texts.intersection(dev_texts))
     if overlap:
-        msg.warn("{} training examples also in evaluation data".format(overlap))
+        msg.warn(f"{overlap} training examples also in evaluation data")
     else:
         msg.good("No overlap between training and evaluation data")
     if not base_model and len(train_dataset) < BLANK_MODEL_THRESHOLD:
-        text = "Low number of examples to train from a blank model ({})".format(
-            len(train_dataset)
+        text = (
+            f"Low number of examples to train from a blank model ({len(train_dataset)})"
         )
         if len(train_dataset) < BLANK_MODEL_MIN_THRESHOLD:
             msg.fail(text)
         else:
             msg.warn(text)
         msg.text(
-            "It's recommended to use at least {} examples (minimum {})".format(
-                BLANK_MODEL_THRESHOLD, BLANK_MODEL_MIN_THRESHOLD
-            ),
+            f"It's recommended to use at least {BLANK_MODEL_THRESHOLD} examples "
+            f"(minimum {BLANK_MODEL_MIN_THRESHOLD})",
             show=verbose,
         )

     msg.divider("Vocab & Vectors")
     n_words = gold_train_data["n_words"]
     msg.info(
-        "{} total {} in the data ({} unique)".format(
-            n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"])
-        )
+        f"{n_words} total word(s) in the data ({len(gold_train_data['words'])} unique)"
     )
     if gold_train_data["n_misaligned_words"] > 0:
-        msg.warn(
-            "{} misaligned tokens in the training data".format(
-                gold_train_data["n_misaligned_words"]
-            )
-        )
+        n_misaligned = gold_train_data["n_misaligned_words"]
+        msg.warn(f"{n_misaligned} misaligned tokens in the training data")
     if gold_dev_data["n_misaligned_words"] > 0:
-        msg.warn(
-            "{} misaligned tokens in the dev data".format(
-                gold_dev_data["n_misaligned_words"]
-            )
-        )
+        n_misaligned = gold_dev_data["n_misaligned_words"]
+        msg.warn(f"{n_misaligned} misaligned tokens in the dev data")
     most_common_words = gold_train_data["words"].most_common(10)
     msg.text(
-        "10 most common words: {}".format(
-            _format_labels(most_common_words, counts=True)
-        ),
+        f"10 most common words: {_format_labels(most_common_words, counts=True)}",
         show=verbose,
     )
     if len(nlp.vocab.vectors):
         msg.info(
-            "{} vectors ({} unique keys, {} dimensions)".format(
-                len(nlp.vocab.vectors),
-                nlp.vocab.vectors.n_keys,
-                nlp.vocab.vectors_length,
-            )
+            f"{len(nlp.vocab.vectors)} vectors ({nlp.vocab.vectors.n_keys} "
+            f"unique keys, {nlp.vocab.vectors_length} dimensions)"
         )
     else:
         msg.info("No word vectors present in the model")
@@ -203,19 +181,10 @@ def debug_data(

         msg.divider("Named Entity Recognition")
         msg.info(
-            "{} new {}, {} existing {}".format(
-                len(new_labels),
-                "label" if len(new_labels) == 1 else "labels",
-                len(existing_labels),
-                "label" if len(existing_labels) == 1 else "labels",
-            )
+            f"{len(new_labels)} new label(s), {len(existing_labels)} existing label(s)"
         )
         missing_values = label_counts["-"]
-        msg.text(
-            "{} missing {} (tokens with '-' label)".format(
-                missing_values, "value" if missing_values == 1 else "values"
-            )
-        )
+        msg.text(f"{missing_values} missing value(s) (tokens with '-' label)")
         for label in new_labels:
             if len(label) == 0:
                 msg.fail("Empty label found in new labels")
@@ -226,33 +195,24 @@ def debug_data(
             if label != "-"
         ]
         labels_with_counts = _format_labels(labels_with_counts, counts=True)
-        msg.text("New: {}".format(labels_with_counts), show=verbose)
+        msg.text(f"New: {labels_with_counts}", show=verbose)
         if existing_labels:
-            msg.text(
-                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
-            )
-
+            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
         if gold_train_data["ws_ents"]:
-            msg.fail(
-                "{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])
-            )
+            msg.fail(f"{gold_train_data['ws_ents']} invalid whitespace entity spans")
             has_ws_ents_error = True

         for label in new_labels:
             if label_counts[label] <= NEW_LABEL_THRESHOLD:
                 msg.warn(
-                    "Low number of examples for new label '{}' ({})".format(
-                        label, label_counts[label]
-                    )
+                    f"Low number of examples for new label '{label}' ({label_counts[label]})"
                 )
                 has_low_data_warning = True

                 with msg.loading("Analyzing label distribution..."):
                     neg_docs = _get_examples_without_label(train_dataset, label)
                 if neg_docs == 0:
-                    msg.warn(
-                        "No examples for texts WITHOUT new label '{}'".format(label)
-                    )
+                    msg.warn(f"No examples for texts WITHOUT new label '{label}'")
                     has_no_neg_warning = True

         if not has_low_data_warning:
@@ -264,8 +224,8 @@ def debug_data(

         if has_low_data_warning:
             msg.text(
-                "To train a new entity type, your data should include at "
-                "least {} instances of the new label".format(NEW_LABEL_THRESHOLD),
+                f"To train a new entity type, your data should include at "
+                f"least {NEW_LABEL_THRESHOLD} instances of the new label",
                 show=verbose,
             )
         if has_no_neg_warning:
@@ -288,27 +248,21 @@ def debug_data(
         new_labels = [l for l in labels if l not in model_labels]
         existing_labels = [l for l in labels if l in model_labels]
         msg.info(
-            "Text Classification: {} new label(s), {} existing label(s)".format(
-                len(new_labels), len(existing_labels)
-            )
+            f"Text Classification: {len(new_labels)} new label(s), "
+            f"{len(existing_labels)} existing label(s)"
         )
         if new_labels:
             labels_with_counts = _format_labels(
                 gold_train_data["cats"].most_common(), counts=True
             )
-            msg.text("New: {}".format(labels_with_counts), show=verbose)
+            msg.text(f"New: {labels_with_counts}", show=verbose)
         if existing_labels:
-            msg.text(
-                "Existing: {}".format(_format_labels(existing_labels)), show=verbose
-            )
+            msg.text(f"Existing: {_format_labels(existing_labels)}", show=verbose)
         if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
             msg.fail(
-                "The train and dev labels are not the same. "
-                "Train labels: {}. "
-                "Dev labels: {}.".format(
-                    _format_labels(gold_train_data["cats"]),
-                    _format_labels(gold_dev_data["cats"]),
-                )
+                f"The train and dev labels are not the same. "
+                f"Train labels: {_format_labels(gold_train_data['cats'])}. "
+                f"Dev labels: {_format_labels(gold_dev_data['cats'])}."
             )
         if gold_train_data["n_cats_multilabel"] > 0:
             msg.info(
@@ -338,27 +292,16 @@ def debug_data(
         msg.divider("Part-of-speech Tagging")
         labels = [label for label in gold_train_data["tags"]]
         tag_map = nlp.vocab.morphology.tag_map
-        msg.info(
-            "{} {} in data ({} {} in tag map)".format(
-                len(labels),
-                "label" if len(labels) == 1 else "labels",
-                len(tag_map),
-                "label" if len(tag_map) == 1 else "labels",
-            )
-        )
+        msg.info(f"{len(labels)} label(s) in data ({len(tag_map)} label(s) in tag map)")
         labels_with_counts = _format_labels(
             gold_train_data["tags"].most_common(), counts=True
         )
         msg.text(labels_with_counts, show=verbose)
         non_tagmap = [l for l in labels if l not in tag_map]
         if not non_tagmap:
-            msg.good("All labels present in tag map for language '{}'".format(nlp.lang))
+            msg.good(f"All labels present in tag map for language '{nlp.lang}'")
         for label in non_tagmap:
-            msg.fail(
-                "Label '{}' not found in tag map for language '{}'".format(
-                    label, nlp.lang
-                )
-            )
+            msg.fail(f"Label '{label}' not found in tag map for language '{nlp.lang}'")

     if "parser" in pipeline:
         has_low_data_warning = False
@@ -366,21 +309,18 @@ def debug_data(

         # profile sentence length
         msg.info(
-            "Found {} sentence{} with an average length of {:.1f} words.".format(
-                gold_train_data["n_sents"],
-                "s" if len(train_dataset) > 1 else "",
-                gold_train_data["n_words"] / gold_train_data["n_sents"],
-            )
+            f"Found {gold_train_data['n_sents']} sentence(s) with an average "
+            f"length of {gold_train_data['n_words'] / gold_train_data['n_sents']:.1f} words."
         )

         # check for documents with multiple sentences
         sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
         if sents_per_doc < 1.1:
             msg.warn(
-                "The training data contains {:.2f} sentences per "
-                "document. When there are very few documents containing more "
-                "than one sentence, the parser will not learn how to segment "
-                "longer texts into sentences.".format(sents_per_doc)
+                f"The training data contains {sents_per_doc:.2f} sentences per "
+                f"document. When there are very few documents containing more "
+                f"than one sentence, the parser will not learn how to segment "
+                f"longer texts into sentences."
             )

         # profile labels
@@ -391,32 +331,13 @@ def debug_data(
         labels_dev = [label for label in gold_dev_data["deps"]]

         if gold_train_unpreprocessed_data["n_nonproj"] > 0:
-            msg.info(
-                "Found {} nonprojective train sentence{}".format(
-                    gold_train_unpreprocessed_data["n_nonproj"],
-                    "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "",
-                )
-            )
+            n_nonproj = gold_train_unpreprocessed_data["n_nonproj"]
+            msg.info(f"Found {n_nonproj} nonprojective train sentence(s)")
         if gold_dev_data["n_nonproj"] > 0:
-            msg.info(
-                "Found {} nonprojective dev sentence{}".format(
-                    gold_dev_data["n_nonproj"],
-                    "s" if gold_dev_data["n_nonproj"] > 1 else "",
-                )
-            )
-
-        msg.info(
-            "{} {} in train data".format(
-                len(labels_train_unpreprocessed),
-                "label" if len(labels_train) == 1 else "labels",
-            )
-        )
-        msg.info(
-            "{} {} in projectivized train data".format(
-                len(labels_train), "label" if len(labels_train) == 1 else "labels"
-            )
-        )
-
+            n_nonproj = gold_dev_data["n_nonproj"]
+            msg.info(f"Found {n_nonproj} nonprojective dev sentence(s)")
+        msg.info(f"{len(labels_train_unpreprocessed)} label(s) in train data")
+        msg.info(f"{len(labels_train)} label(s) in projectivized train data")
         labels_with_counts = _format_labels(
             gold_train_unpreprocessed_data["deps"].most_common(), counts=True
         )
@@ -426,9 +347,8 @@ def debug_data(
         for label in gold_train_unpreprocessed_data["deps"]:
             if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD:
                 msg.warn(
-                    "Low number of examples for label '{}' ({})".format(
-                        label, gold_train_unpreprocessed_data["deps"][label]
-                    )
+                    f"Low number of examples for label '{label}' "
+                    f"({gold_train_unpreprocessed_data['deps'][label]})"
                 )
                 has_low_data_warning = True

@@ -437,22 +357,19 @@ def debug_data(
         for label in gold_train_data["deps"]:
             if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label:
                 rare_projectivized_labels.append(
-                    "{}: {}".format(label, str(gold_train_data["deps"][label]))
+                    f"{label}: {gold_train_data['deps'][label]}"
                 )

         if len(rare_projectivized_labels) > 0:
             msg.warn(
-                "Low number of examples for {} label{} in the "
-                "projectivized dependency trees used for training. You may "
-                "want to projectivize labels such as punct before "
-                "training in order to improve parser performance.".format(
-                    len(rare_projectivized_labels),
-                    "s" if len(rare_projectivized_labels) > 1 else "",
-                )
+                f"Low number of examples for {len(rare_projectivized_labels)} "
+                "label(s) in the projectivized dependency trees used for "
+                "training. You may want to projectivize labels such as punct "
+                "before training in order to improve parser performance."
             )
             msg.warn(
-                "Projectivized labels with low numbers of examples: "
-                "{}".format("\n".join(rare_projectivized_labels)),
+                "Projectivized labels with low numbers of examples:",
+                ", ".join(rare_projectivized_labels),
                 show=verbose,
             )
             has_low_data_warning = True

@@ -460,50 +377,44 @@ def debug_data(
         # labels only in train
         if set(labels_train) - set(labels_dev):
             msg.warn(
-                "The following labels were found only in the train data: "
-                "{}".format(", ".join(set(labels_train) - set(labels_dev))),
+                "The following labels were found only in the train data:",
+                ", ".join(set(labels_train) - set(labels_dev)),
                 show=verbose,
             )

         # labels only in dev
         if set(labels_dev) - set(labels_train):
             msg.warn(
-                "The following labels were found only in the dev data: "
-                + ", ".join(set(labels_dev) - set(labels_train)),
+                "The following labels were found only in the dev data:",
+                ", ".join(set(labels_dev) - set(labels_train)),
                 show=verbose,
             )

         if has_low_data_warning:
             msg.text(
-                "To train a parser, your data should include at "
-                "least {} instances of each label.".format(DEP_LABEL_THRESHOLD),
+                f"To train a parser, your data should include at "
+                f"least {DEP_LABEL_THRESHOLD} instances of each label.",
                 show=verbose,
             )

         # multiple root labels
         if len(gold_train_unpreprocessed_data["roots"]) > 1:
             msg.warn(
-                "Multiple root labels ({}) ".format(
-                    ", ".join(gold_train_unpreprocessed_data["roots"])
-                )
-                + "found in training data. spaCy's parser uses a single root "
-                "label ROOT so this distinction will not be available."
+                f"Multiple root labels "
+                f"({', '.join(gold_train_unpreprocessed_data['roots'])}) "
+                f"found in training data. spaCy's parser uses a single root "
+                f"label ROOT so this distinction will not be available."
             )

         # these should not happen, but just in case
         if gold_train_data["n_nonproj"] > 0:
             msg.fail(
-                "Found {} nonprojective projectivized train sentence{}".format(
-                    gold_train_data["n_nonproj"],
-                    "s" if gold_train_data["n_nonproj"] > 1 else "",
-                )
+                f"Found {gold_train_data['n_nonproj']} nonprojective "
+                f"projectivized train sentence(s)"
             )
         if gold_train_data["n_cycles"] > 0:
             msg.fail(
-                "Found {} projectivized train sentence{} with cycles".format(
-                    gold_train_data["n_cycles"],
-                    "s" if gold_train_data["n_cycles"] > 1 else "",
-                )
+                f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
             )

     msg.divider("Summary")
@@ -511,36 +422,28 @@ def debug_data(
     good_counts = msg.counts[MESSAGES.GOOD]
     warn_counts = msg.counts[MESSAGES.WARN]
     fail_counts = msg.counts[MESSAGES.FAIL]
     if good_counts:
-        msg.good(
-            "{} {} passed".format(
-                good_counts, "check" if good_counts == 1 else "checks"
-            )
-        )
+        msg.good(f"{good_counts} {'check' if good_counts == 1 else 'checks'} passed")
     if warn_counts:
-        msg.warn(
-            "{} {}".format(warn_counts, "warning" if warn_counts == 1 else "warnings")
-        )
-    if fail_counts:
-        msg.fail("{} {}".format(fail_counts, "error" if fail_counts == 1 else "errors"))
-
+        msg.warn(f"{warn_counts} {'warning' if warn_counts == 1 else 'warnings'}")
     if fail_counts:
+        msg.fail(f"{fail_counts} {'error' if fail_counts == 1 else 'errors'}")
         sys.exit(1)


 def _load_file(file_path, msg):
     file_name = file_path.parts[-1]
     if file_path.suffix == ".json":
-        with msg.loading("Loading {}...".format(file_name)):
+        with msg.loading(f"Loading {file_name}..."):
             data = srsly.read_json(file_path)
-        msg.good("Loaded {}".format(file_name))
+        msg.good(f"Loaded {file_name}")
         return data
     elif file_path.suffix == ".jsonl":
-        with msg.loading("Loading {}...".format(file_name)):
+        with msg.loading(f"Loading {file_name}..."):
             data = srsly.read_jsonl(file_path)
-        msg.good("Loaded {}".format(file_name))
+        msg.good(f"Loaded {file_name}")
         return data
     msg.fail(
-        "Can't load file extension {}".format(file_path.suffix),
+        f"Can't load file extension {file_path.suffix}",
         "Expected .json or .jsonl",
         exits=1,
     )
@@ -604,14 +507,18 @@ def _compile_gold(examples, pipeline):

 def _format_labels(labels, counts=False):
     if counts:
-        return ", ".join(["'{}' ({})".format(l, c) for l, c in labels])
-    return ", ".join(["'{}'".format(l) for l in labels])
+        return ", ".join([f"'{l}' ({c})" for l, c in labels])
+    return ", ".join([f"'{l}'" for l in labels])


 def _get_examples_without_label(data, label):
     count = 0
     for ex in data:
-        labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
+        labels = [
+            label.split("-")[1]
+            for label in ex.gold.ner
+            if label not in ("O", "-", None)
+        ]
         if label not in labels:
             count += 1
     return count
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 19f3e7860..7c87a582a 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import requests
 import os
@@ -50,7 +47,7 @@ def download(model, direct=False, *pip_args):
             sys.exit(dl)
         msg.good(
             "Download and installation successful",
-            "You can now load the model via spacy.load('{}')".format(model_name),
+            f"You can now load the model via spacy.load('{model_name}')",
         )
         # Only create symlink if the model is installed via a shortcut like 'en'.
         # There's no real advantage over an additional symlink for en_core_web_sm
@@ -69,10 +66,10 @@ def download(model, direct=False, *pip_args):
                 # message and loading instructions, even if linking fails.
                 msg.warn(
                     "Download successful but linking failed",
-                    "Creating a shortcut link for '{}' didn't work (maybe you "
-                    "don't have admin permissions?), but you can still load "
-                    "the model via its full package name: "
-                    "nlp = spacy.load('{}')".format(model, model_name),
+                    f"Creating a shortcut link for '{model}' didn't work (maybe you "
+                    f"don't have admin permissions?), but you can still load "
+                    f"the model via its full package name: "
+                    f"nlp = spacy.load('{model_name}')",
                 )
         # If a model is downloaded and then loaded within the same process, our
         # is_package check currently fails, because pkg_resources.working_set
@@ -95,11 +92,11 @@ def get_json(url, desc):
     r = requests.get(url)
     if r.status_code != 200:
         msg.fail(
-            "Server error ({})".format(r.status_code),
-            "Couldn't fetch {}. Please find a model for your spaCy "
-            "installation (v{}), and download it manually. For more "
-            "details, see the documentation: "
-            "https://spacy.io/usage/models".format(desc, about.__version__),
+            f"Server error ({r.status_code})",
+            f"Couldn't fetch {desc}. Please find a model for your spaCy "
+            f"installation (v{about.__version__}), and download it manually. "
+            f"For more details, see the documentation: "
+            f"https://spacy.io/usage/models",
             exits=1,
         )
     return r.json()
@@ -111,7 +108,7 @@ def get_compatibility():
     comp_table = get_json(about.__compatibility__, "compatibility table")
     comp = comp_table["spacy"]
     if version not in comp:
-        msg.fail("No compatible models found for v{} of spaCy".format(version), exits=1)
+        msg.fail(f"No compatible models found for v{version} of spaCy", exits=1)
     return comp[version]


@@ -119,8 +116,8 @@ def get_version(model, comp):
     model = model.rsplit(".dev", 1)[0]
     if model not in comp:
         msg.fail(
-            "No compatible model found for '{}' "
-            "(spaCy v{}).".format(model, about.__version__),
+            f"No compatible model found for '{model}' "
+            f"(spaCy v{about.__version__}).",
             exits=1,
         )
     return comp[model][0]
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index da8a714a7..de2cb4d09 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
 import plac
 from timeit import default_timer as timer
 from wasabi import msg
@@ -79,7 +76,7 @@ def evaluate(
             deps=render_deps,
             ents=render_ents,
         )
-        msg.good("Generated {} parses as HTML".format(displacy_limit), displacy_path)
+        msg.good(f"Generated {displacy_limit} parses as HTML", displacy_path)
     if return_scores:
         return scorer.scores
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 080d0dc77..060a38e78 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import platform
 from pathlib import Path
 from wasabi import msg
 import srsly

-from ..compat import path2str, basestring_, unicode_
 from .. import util
 from .. import about

@@ -33,12 +29,12 @@ def info(model=None, markdown=False, silent=False):
             msg.fail("Can't find model meta.json", meta_path, exits=1)
         meta = srsly.read_json(meta_path)
         if model_path.resolve() != model_path:
-            meta["link"] = path2str(model_path)
-            meta["source"] = path2str(model_path.resolve())
+            meta["link"] = str(model_path)
+            meta["source"] = str(model_path.resolve())
         else:
-            meta["source"] = path2str(model_path)
+            meta["source"] = str(model_path)
         if not silent:
-            title = "Info about model '{}'".format(model)
+            title = f"Info about model '{model}'"
             model_meta = {
                 k: v for k, v in meta.items() if k not in ("accuracy", "speed")
             }
@@ -49,7 +45,7 @@ def info(model=None, markdown=False, silent=False):
         return meta
     data = {
         "spaCy version": about.__version__,
-        "Location": path2str(Path(__file__).parent.parent),
+        "Location": str(Path(__file__).parent.parent),
         "Platform": platform.platform(),
         "Python version": platform.python_version(),
         "Models": list_models(),
@@ -84,9 +80,9 @@ def print_markdown(data, title=None):
     """
     markdown = []
     for key, value in data.items():
-        if isinstance(value, basestring_) and Path(value).exists():
+        if isinstance(value, str) and Path(value).exists():
             continue
-        markdown.append("* **{}:** {}".format(key, unicode_(value)))
+        markdown.append(f"* **{key}:** {value}")
     if title:
-        print("\n## {}".format(title))
+        print(f"\n## {title}")
     print("\n{}\n".format("\n".join(markdown)))
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index 3fa0cc890..c3ef5267c 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import math
 from tqdm import tqdm
@@ -91,8 +88,7 @@ def init_model(
     vec_added = len(nlp.vocab.vectors)
     lex_added = len(nlp.vocab)
     msg.good(
-        "Sucessfully compiled vocab",
-        "{} entries, {} vectors".format(lex_added, vec_added),
+        "Successfully compiled vocab", f"{lex_added} entries, {vec_added} vectors",
     )
     if not output_dir.exists():
         output_dir.mkdir()
@@ -177,9 +173,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors, name=None):
             nlp.vocab.vectors.add(lex.orth, row=lex.rank)
     else:
         if vectors_loc:
-            with msg.loading("Reading vectors from {}".format(vectors_loc)):
+            with msg.loading(f"Reading vectors from {vectors_loc}"):
                 vectors_data, vector_keys = read_vectors(vectors_loc)
-            msg.good("Loaded vectors from {}".format(vectors_loc))
+            msg.good(f"Loaded vectors from {vectors_loc}")
         else:
             vectors_data, vector_keys = (None, None)
     if vector_keys is not None:
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 8117829b5..df24adc23 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -1,11 +1,8 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 from pathlib import Path
 from wasabi import msg

-from ..compat import symlink_to, path2str
+from ..compat import symlink_to
 from .. import util


@@ -27,23 +24,23 @@ def link(origin, link_name, force=False, model_path=None):
     if not model_path.exists():
         msg.fail(
             "Can't locate model data",
-            "The data should be located in {}".format(path2str(model_path)),
+            f"The data should be located in {model_path}",
             exits=1,
         )
     data_path = util.get_data_path()
     if not data_path or not data_path.exists():
         spacy_loc = Path(__file__).parent.parent
         msg.fail(
-            "Can't find the spaCy data path to create model symlink",
-            "Make sure a directory `/data` exists within your spaCy "
-            "installation and try again. The data directory should be located "
-            "here:".format(path=spacy_loc),
+            f"Can't find the spaCy data path to create model symlink",
+            f"Make sure a directory `/data` exists within your spaCy "
+            f"installation and try again. The data directory should be located "
+            f"here: {spacy_loc}",
             exits=1,
         )
     link_path = util.get_data_path() / link_name
     if link_path.is_symlink() and not force:
         msg.fail(
-            "Link '{}' already exists".format(link_name),
+            f"Link '{link_name}' already exists",
             "To overwrite an existing link, use the --force flag",
             exits=1,
         )
@@ -54,18 +51,18 @@ def link(origin, link_name, force=False, model_path=None):
     elif link_path.exists():  # does it exist otherwise?
         # NB: Check this last because valid symlinks also "exist".
         msg.fail(
-            "Can't overwrite symlink '{}'".format(link_name),
+            f"Can't overwrite symlink '{link_name}'",
             "This can happen if your data directory contains a directory or "
             "file of the same name.",
             exits=1,
         )
-    details = "%s --> %s" % (path2str(model_path), path2str(link_path))
+    details = f"{model_path} --> {link_path}"
     try:
         symlink_to(link_path, model_path)
     except:  # noqa: E722
         # This is quite dirty, but just making sure other errors are caught.
         msg.fail(
-            "Couldn't link model to '{}'".format(link_name),
+            f"Couldn't link model to '{link_name}'",
             "Creating a symlink in spacy/data failed. Make sure you have the "
             "required permissions and try re-running the command as admin, or "
             "use a virtualenv. You can still import the model as a module and "
@@ -74,4 +71,4 @@ def link(origin, link_name, force=False, model_path=None):
         msg.text(details)
         raise
     msg.good("Linking successful", details)
-    msg.text("You can now load the model via spacy.load('{}')".format(link_name))
+    msg.text(f"You can now load the model via spacy.load('{link_name}')")
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 8ed92259c..8830a0ca2 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import plac
 import shutil
 from pathlib import Path
 from wasabi import msg, get_raw_input
 import srsly

-from ..compat import path2str
 from .. import util
 from .. import about

@@ -47,7 +43,7 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
     for key in ("lang", "name", "version"):
         if key not in meta or meta[key] == "":
             msg.fail(
-                "No '{}' setting found in meta.json".format(key),
+                f"No '{key}' setting found in meta.json",
                 "This setting is required to build your package.",
                 exits=1,
             )
@@ -58,22 +54,21 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals

     if package_path.exists():
         if force:
-            shutil.rmtree(path2str(package_path))
+            shutil.rmtree(str(package_path))
         else:
             msg.fail(
                 "Package directory already exists",
                 "Please delete the directory and try again, or use the "
-                "`--force` flag to overwrite existing "
-                "directories.".format(path=path2str(package_path)),
+                "`--force` flag to overwrite existing directories.",
                 exits=1,
             )
     Path.mkdir(package_path, parents=True)
-    shutil.copytree(path2str(input_path), path2str(package_path / model_name_v))
+    shutil.copytree(str(input_path), str(package_path / model_name_v))
     create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
     create_file(main_path / "setup.py", TEMPLATE_SETUP)
     create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
     create_file(package_path / "__init__.py", TEMPLATE_INIT)
-    msg.good("Successfully created package '{}'".format(model_name_v), main_path)
+    msg.good(f"Successfully created package '{model_name_v}'", main_path)
     msg.text("To build the package, run `python setup.py sdist` in this directory.")


@@ -118,9 +113,6 @@ def generate_meta(model_path, existing_meta, msg):

 TEMPLATE_SETUP = """
 #!/usr/bin/env python
-# coding: utf8
-from __future__ import unicode_literals
-
 import io
 import json
 from os import path, walk
@@ -190,9 +182,6 @@ include meta.json


 TEMPLATE_INIT = """
-# coding: utf8
-from __future__ import unicode_literals
-
 from pathlib import Path
 from spacy.util import load_model_from_init_py, get_model_meta
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index 68038bc5c..75840923e 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import print_function, unicode_literals
-
 import plac
 import random
 import numpy
@@ -154,9 +151,9 @@ def pretrain(
         msg.text("Reading input text from stdin...")
         texts = srsly.read_jsonl("-")

-    with msg.loading("Loading model '{}'...".format(vectors_model)):
+    with msg.loading(f"Loading model '{vectors_model}'..."):
         nlp = util.load_model(vectors_model)
-    msg.good("Loaded model '{}'".format(vectors_model))
+    msg.good(f"Loaded model '{vectors_model}'")
     pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name
     model = create_pretraining_model(
         nlp,
@@ -173,7 +170,7 @@ def pretrain(
     # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text("Loaded pretrained tok2vec for: {}".format(components))
+        msg.text(f"Loaded pretrained tok2vec for: {components}")
         # Parse the epoch number from the given weight file
         model_name = re.search(r"model\d+\.bin", str(init_tok2vec))
         if model_name:
@@ -221,7 +218,9 @@ def pretrain(
     skip_counter = 0
     for epoch in range(epoch_start, n_iter + epoch_start):
         for batch_id, batch in enumerate(
-            util.minibatch_by_words((Example(doc=text) for text in texts), size=batch_size)
+            util.minibatch_by_words(
+                (Example(doc=text) for text in texts), size=batch_size
+            )
         ):
             docs, count = make_docs(
                 nlp,
@@ -246,7 +245,7 @@ def pretrain(
         # Reshuffle the texts if texts were loaded from a file
         random.shuffle(texts)
     if skip_counter > 0:
-        msg.warn("Skipped {count} empty values".format(count=str(skip_counter)))
+        msg.warn(f"Skipped {skip_counter} empty values")
     msg.good("Successfully finished pretrain")
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 4ee72fc23..f3df0817d 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
 import plac
 import tqdm
 from pathlib import Path
@@ -34,11 +31,11 @@ def profile(model, inputs=None, n_texts=10000):
         with msg.loading("Loading IMDB dataset via Thinc..."):
             imdb_train, _ = thinc.extra.datasets.imdb()
             inputs, _ = zip(*imdb_train)
-        msg.info("Loaded IMDB dataset and using {} examples".format(n_inputs))
+        msg.info(f"Loaded IMDB dataset and using {n_inputs} examples")
         inputs = inputs[:n_inputs]
-    with msg.loading("Loading model '{}'...".format(model)):
+    with msg.loading(f"Loading model '{model}'..."):
         nlp = load_model(model)
-    msg.good("Loaded model '{}'".format(model))
+    msg.good(f"Loaded model '{model}'")
     texts = list(itertools.islice(inputs, n_texts))
     cProfile.runctx("parse_texts(nlp, texts)", globals(), locals(), "Profile.prof")
     s = pstats.Stats("Profile.prof")
@@ -60,7 +57,7 @@ def _read_inputs(loc, msg):
         input_path = Path(loc)
         if not input_path.exists() or not input_path.is_file():
             msg.fail("Not a valid input data file", loc, exits=1)
-        msg.info("Using data from {}".format(input_path.parts[-1]))
+        msg.info(f"Using data from {input_path.parts[-1]}")
         file_ = input_path.open()
     for line in file_:
         data = srsly.json_loads(line)
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index daa90f022..e8662a101 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals, division, print_function
-
 import plac
 import os
 import tqdm
@@ -12,12 +9,10 @@ import srsly
 from wasabi import msg
 import contextlib
 import random
-from collections import OrderedDict

 from .._ml import create_default_optimizer
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus
-from ..compat import path2str
 from .. import util
 from .. import about

@@ -148,14 +143,14 @@ def train(
     # the model and make sure the pipeline matches the pipeline setting. If
     # training starts from a blank model, intitalize the language class.
     pipeline = [p.strip() for p in pipeline.split(",")]
-    msg.text("Training pipeline: {}".format(pipeline))
+    msg.text(f"Training pipeline: {pipeline}")
     if base_model:
-        msg.text("Starting with base model '{}'".format(base_model))
+        msg.text(f"Starting with base model '{base_model}'")
         nlp = util.load_model(base_model)
         if nlp.lang != lang:
             msg.fail(
-                "Model language ('{}') doesn't match language specified as "
-                "`lang` argument ('{}') ".format(nlp.lang, lang),
+                f"Model language ('{nlp.lang}') doesn't match language "
+                f"specified as `lang` argument ('{lang}') ",
                 exits=1,
             )
         nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
@@ -187,15 +182,13 @@ def train(
                 }
                 if base_cfg != pipe_cfg:
                     msg.fail(
-                        "The base textcat model configuration does"
-                        "not match the provided training options. "
-                        "Existing cfg: {}, provided cfg: {}".format(
-                            base_cfg, pipe_cfg
-                        ),
+                        f"The base textcat model configuration does "
+                        f"not match the provided training options. "
+                        f"Existing cfg: {base_cfg}, provided cfg: {pipe_cfg}",
                         exits=1,
                     )
     else:
-        msg.text("Starting with blank model '{}'".format(lang))
+        msg.text(f"Starting with blank model '{lang}'")
         lang_cls = util.get_lang_class(lang)
         nlp = lang_cls()
     for pipe in pipeline:
@@ -215,7 +208,7 @@ def train(
         nlp.vocab.morphology.tag_map.update(tag_map)

     if vectors:
-        msg.text("Loading vector from model '{}'".format(vectors))
+        msg.text(f"Loading vectors from model '{vectors}'")
         _load_vectors(nlp, vectors)

     # Multitask objectives
@@ -224,15 +217,15 @@ def train(
         if multitasks:
             if pipe_name not in pipeline:
                 msg.fail(
-                    "Can't use multitask objective without '{}' in the "
-                    "pipeline".format(pipe_name)
+                    f"Can't use multitask objective without '{pipe_name}' in "
+                    f"the pipeline"
                 )
             pipe = nlp.get_pipe(pipe_name)
             for objective in multitasks.split(","):
                 pipe.add_multitask_objective(objective)

     # Prepare training corpus
-    msg.text("Counting training words (limit={})".format(n_examples))
+    msg.text(f"Counting training words (limit={n_examples})")
     corpus = GoldCorpus(train_path, dev_path, limit=n_examples)
     n_train_words = corpus.count_train()

@@ -248,22 +241,22 @@ def train(
     # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
-        msg.text("Loaded pretrained tok2vec for: {}".format(components))
+        msg.text(f"Loaded pretrained tok2vec for: {components}")

     # Verify textcat config
     if "textcat" in pipeline:
         textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
         if textcat_positive_label and textcat_positive_label not in textcat_labels:
             msg.fail(
-                "The textcat_positive_label (tpl) '{}' does not match any "
-                "label in the training data.".format(textcat_positive_label),
+                f"The textcat_positive_label (tpl) '{textcat_positive_label}' "
+                f"does not match any label in the training data.",
                 exits=1,
             )
         if textcat_positive_label and len(textcat_labels) != 2:
             msg.fail(
-                "A textcat_positive_label (tpl) '{}' was provided for training "
-                "data that does not appear to be a binary classification "
-                "problem with two labels.".format(textcat_positive_label),
+                f"A textcat_positive_label (tpl) '{textcat_positive_label}' was "
+                "provided for training data that does not appear to be a "
+                "binary classification problem with two labels.",
                 exits=1,
             )
         train_data = corpus.train_data(
@@ -302,20 +295,20 @@ def train(
                     break
             if base_model and set(textcat_labels) != train_labels:
                 msg.fail(
-                    "Cannot extend textcat model using data with different "
-                    "labels. Base model labels: {}, training data labels: "
-                    "{}.".format(textcat_labels, list(train_labels)),
+                    f"Cannot extend textcat model using data with different "
+                    f"labels. Base model labels: {textcat_labels}, training data "
+                    f"labels: {list(train_labels)}",
                     exits=1,
                 )
             if textcat_multilabel:
                 msg.text(
-                    "Textcat evaluation score: ROC AUC score macro-averaged across "
-                    "the labels '{}'".format(", ".join(textcat_labels))
+                    f"Textcat evaluation score: ROC AUC score macro-averaged across "
+                    f"the labels '{', '.join(textcat_labels)}'"
                 )
             elif textcat_positive_label and len(textcat_labels) == 2:
                 msg.text(
-                    "Textcat evaluation score: F1-score for the "
-                    "label '{}'".format(textcat_positive_label)
+                    f"Textcat evaluation score: F1-score for the "
+                    f"label '{textcat_positive_label}'"
                 )
             elif len(textcat_labels) > 1:
                 if len(textcat_labels) == 2:
@@ -325,8 +318,8 @@ def train(
                         "an evaluation on the positive class."
                     )
                 msg.text(
-                    "Textcat evaluation score: F1-score macro-averaged across "
-                    "the labels '{}'".format(", ".join(textcat_labels))
+                    f"Textcat evaluation score: F1-score macro-averaged across "
+                    f"the labels '{', '.join(textcat_labels)}'"
                 )
             else:
                 msg.fail(
@@ -471,8 +464,8 @@ def train(
                     for cat, cat_score in textcats_per_cat.items():
                         if cat_score.get("roc_auc_score", 0) < 0:
                             msg.warn(
-                                "Textcat ROC AUC score is undefined due to "
-                                "only one value in label '{}'.".format(cat)
+                                f"Textcat ROC AUC score is undefined due to "
+                                f"only one value in label '{cat}'."
                             )
                     msg.row(progress, **row_settings)
                 # Early stopping
@@ -485,12 +478,10 @@ def train(
                         best_score = current_score
                     if iter_since_best >= n_early_stopping:
                         msg.text(
-                            "Early stopping, best iteration "
-                            "is: {}".format(i - iter_since_best)
+                            f"Early stopping, best iteration is: {i - iter_since_best}"
                         )
                         msg.text(
-                            "Best score = {}; Final iteration "
-                            "score = {}".format(best_score, current_score)
+                            f"Best score = {best_score}; Final iteration score = {current_score}"
                         )
                         break
     finally:
@@ -560,11 +551,11 @@ def _collate_best_model(meta, output_path, components):
     for component in components:
         bests[component] = _find_best(output_path, component)
     best_dest = output_path / "model-best"
-    shutil.copytree(path2str(output_path / "model-final"), path2str(best_dest))
+    shutil.copytree(str(output_path / "model-final"), str(best_dest))
     for component, best_component_src in bests.items():
-        shutil.rmtree(path2str(best_dest / component))
+        shutil.rmtree(str(best_dest / component))
         shutil.copytree(
-            path2str(best_component_src / component), path2str(best_dest / component)
+            str(best_component_src / component), str(best_dest / component)
         )
         accs = srsly.read_json(best_component_src / "accuracy.json")
         for metric in _get_metrics(component):
@@ -627,10 +618,8 @@ def _configure_training_output(pipeline, use_gpu, has_beam_widths):
     if has_beam_widths:
         row_head.insert(1, "Beam W.")
     # remove duplicates
-    row_head_dict = OrderedDict()
-    row_head_dict.update({k: 1 for k in row_head})
-    output_stats_dict = OrderedDict()
-    output_stats_dict.update({k: 1 for k in output_stats})
+    row_head_dict = {k: 1 for k in row_head}
+    output_stats_dict = {k: 1 for k in output_stats}
     return row_head_dict.keys(), output_stats_dict.keys()
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index 93abad6f6..b4d217f2f 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,13 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals, print_function
-
 from pathlib import Path
 import sys
 import requests
 import srsly
 from wasabi import msg

-from ..compat import path2str
 from ..util import get_data_path
 from .. import about

@@ -21,7 +17,7 @@ def validate():
     r = requests.get(about.__compatibility__)
     if r.status_code != 200:
         msg.fail(
-            "Server error ({})".format(r.status_code),
+            f"Server error ({r.status_code})",
             "Couldn't fetch compatibility table.",
             exits=1,
         )
@@ -32,7 +28,7 @@ def validate():
     current_compat = compat.get(version)
     if not current_compat:
         msg.fail(
-            "Can't find spaCy v{} in compatibility table".format(version),
+            f"Can't find spaCy v{version} in compatibility table",
             about.__compatibility__,
             exits=1,
         )
@@ -52,8 +48,8 @@ def validate():
     update_models = [m for m in incompat_models if m in current_compat]
     spacy_dir = Path(__file__).parent.parent

-    msg.divider("Installed models (spaCy v{})".format(about.__version__))
-    msg.info("spaCy installation: {}".format(path2str(spacy_dir)))
+    msg.divider(f"Installed models (spaCy v{about.__version__})")
+    msg.info(f"spaCy installation: {spacy_dir}")

     if model_links or model_pkgs:
         header = ("TYPE", "NAME", "MODEL", "VERSION", "")
@@ -72,15 +68,15 @@ def validate():
         print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
     if na_models:
         msg.text(
-            "The following models are not available for spaCy "
-            "v{}: {}".format(about.__version__, ", ".join(na_models))
+            f"The following models are not available for spaCy "
+            f"v{about.__version__}: {', '.join(na_models)}"
         )
     if incompat_links:
         msg.text(
-            "You may also want to overwrite the incompatible links using the "
-            "`python -m spacy link` command with `--force`, or remove them "
-            "from the data directory. "
-            "Data path: {path}".format(path=path2str(get_data_path()))
+            f"You may also want to overwrite the incompatible links using the "
+            f"`python -m spacy link` command with `--force`, or remove them "
+            f"from the data directory. "
+            f"Data path: {get_data_path()}"
        )
     if incompat_models or incompat_links:
         sys.exit(1)
@@ -128,7 +124,7 @@ def get_model_row(compat, name, data, msg, model_type="package"):
         version = msg.text(data["version"], color="green", no_print=True)
     else:
         version = msg.text(data["version"], color="red", no_print=True)
-    comp = "--> {}".format(compat.get(data["name"], ["n/a"])[0])
+    comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
     return (model_type, name, data["name"], version, comp)
diff --git a/spacy/compat.py b/spacy/compat.py
index 0ea31c6b3..8cb08ae09 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -1,4 +1,3 @@
-# coding: utf8
 """
 Helpers for Python and platform compatibility. To distinguish them from the
 builtin functions, replacement functions are suffixed with an underscore,
 e.g. `unicode_`.
DOCS: https://spacy.io/api/top-level#compat """ -from __future__ import unicode_literals - import os import sys -import itertools -import ast -import types from thinc.neural.util import copy_array @@ -46,45 +40,11 @@ copy_reg = copy_reg CudaStream = CudaStream cupy = cupy copy_array = copy_array -izip = getattr(itertools, "izip", zip) is_windows = sys.platform.startswith("win") is_linux = sys.platform.startswith("linux") is_osx = sys.platform == "darwin" -# See: https://github.com/benjaminp/six/blob/master/six.py -is_python2 = sys.version_info[0] == 2 -is_python3 = sys.version_info[0] == 3 -is_python_pre_3_5 = is_python2 or (is_python3 and sys.version_info[1] < 5) - -if is_python2: - bytes_ = str - unicode_ = unicode # noqa: F821 - basestring_ = basestring # noqa: F821 - input_ = raw_input # noqa: F821 - path2str = lambda path: str(path).decode("utf8") - class_types = (type, types.ClassType) - -elif is_python3: - bytes_ = bytes - unicode_ = str - basestring_ = str - input_ = input - path2str = lambda path: str(path) - class_types = (type, types.ClassType) if is_python_pre_3_5 else type - - -def b_to_str(b_str): - """Convert a bytes object to a string. - - b_str (bytes): The object to convert. - RETURNS (unicode): The converted string. - """ - if is_python2: - return b_str - # Important: if no encoding is set, string becomes "b'...'" - return str(b_str, encoding="utf8") - def symlink_to(orig, dest): """Create a symlink. Used for model shortcut links. @@ -95,9 +55,7 @@ def symlink_to(orig, dest): if is_windows: import subprocess - subprocess.check_call( - ["mklink", "/d", path2str(orig), path2str(dest)], shell=True - ) + subprocess.check_call(["mklink", "/d", str(orig), str(dest)], shell=True) else: orig.symlink_to(dest) @@ -108,19 +66,17 @@ def symlink_remove(link): link (unicode / Path): The path to the symlink. """ # https://stackoverflow.com/q/26554135/6400719 - if os.path.isdir(path2str(link)) and is_windows: + if os.path.isdir(str(link)) and is_windows: # this should only be on Py2.7 and windows - os.rmdir(path2str(link)) + os.rmdir(str(link)) else: - os.unlink(path2str(link)) + os.unlink(str(link)) -def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): +def is_config(windows=None, linux=None, osx=None, **kwargs): """Check if a specific configuration of Python version and operating system matches the user's setup. Mostly used to display targeted error messages. - python2 (bool): spaCy is executed with Python 2.x. - python3 (bool): spaCy is executed with Python 3.x. windows (bool): spaCy is executed on Windows. linux (bool): spaCy is executed on Linux. osx (bool): spaCy is executed on OS X or macOS. @@ -129,53 +85,7 @@ def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): DOCS: https://spacy.io/api/top-level#compat.is_config """ return ( - python2 in (None, is_python2) - and python3 in (None, is_python3) - and windows in (None, is_windows) + windows in (None, is_windows) and linux in (None, is_linux) and osx in (None, is_osx) ) - - -def import_file(name, loc): - """Import module from a file. Used to load models from a directory. - - name (unicode): Name of module to load. - loc (unicode / Path): Path to the file. - RETURNS: The loaded module. 
- """ - loc = path2str(loc) - if is_python_pre_3_5: - import imp - - return imp.load_source(name, loc) - else: - import importlib.util - - spec = importlib.util.spec_from_file_location(name, str(loc)) - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) - return module - - -def unescape_unicode(string): - """Python2.7's re module chokes when compiling patterns that have ranges - between escaped unicode codepoints if the two codepoints are unrecognised - in the unicode database. For instance: - - re.compile('[\\uAA77-\\uAA79]').findall("hello") - - Ends up matching every character (on Python 2). This problem doesn't occur - if we're dealing with unicode literals. - """ - if string is None: - return string - # We only want to unescape the unicode, so we first must protect the other - # backslashes. - string = string.replace("\\", "\\\\") - # Now we remove that protection for the unicode. - string = string.replace("\\\\u", "\\u") - string = string.replace("\\\\U", "\\U") - # Now we unescape by evaling the string with the AST. This can't execute - # code -- it only does the representational level. - return ast.literal_eval("u'''" + string + "'''") diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index d2ef21dbd..d804757ef 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -1,15 +1,11 @@ -# coding: utf8 """ spaCy's built in visualization suite for dependencies and named entities. DOCS: https://spacy.io/api/top-level#displacy USAGE: https://spacy.io/usage/visualizers """ -from __future__ import unicode_literals - from .render import DependencyRenderer, EntityRenderer from ..tokens import Doc, Span -from ..compat import b_to_str from ..errors import Errors, Warnings, user_warning from ..util import is_in_jupyter @@ -92,20 +88,20 @@ def serve( render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server(host, port, app) - print("\nUsing the '{}' visualizer".format(style)) - print("Serving on http://{}:{} ...\n".format(host, port)) + print(f"\nUsing the '{style}' visualizer") + print(f"Serving on http://{host}:{port} ...\n") try: httpd.serve_forever() except KeyboardInterrupt: - print("Shutting down server on port {}.".format(port)) + print(f"Shutting down server on port {port}.") finally: httpd.server_close() def app(environ, start_response): # Headers and status need to be bytes in Python 2, see #1227 - headers = [(b_to_str(b"Content-type"), b_to_str(b"text/html; charset=utf-8"))] - start_response(b_to_str(b"200 OK"), headers) + headers = [("Content-type", "text/html; charset=utf-8")] + start_response("200 OK", headers) res = _html["parsed"].encode(encoding="utf-8") return [res] diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index d6e33437b..7ca1eebb7 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import uuid from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS @@ -55,7 +52,7 @@ class DependencyRenderer(object): settings = p.get("settings", {}) self.direction = settings.get("direction", DEFAULT_DIR) self.lang = settings.get("lang", DEFAULT_LANG) - render_id = "{}-{}".format(id_prefix, i) + render_id = f"{id_prefix}-{i}" svg = self.render_svg(render_id, p["words"], p["arcs"]) rendered.append(svg) if page: diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py index ade75d1d6..d6970aa2f 100644 --- 
a/spacy/displacy/templates.py +++ b/spacy/displacy/templates.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Setting explicit height and max-width: none on the SVG is required for # Jupyter to render it properly in a cell diff --git a/spacy/errors.py b/spacy/errors.py index 3dab4e1fb..81747b33b 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import os import warnings import inspect @@ -12,7 +9,7 @@ def add_codes(err_cls): class ErrorsWithCodes(object): def __getattribute__(self, code): msg = getattr(err_cls, code) - return "[{code}] {msg}".format(code=code, msg=msg) + return f"[{code}] {msg}" return ErrorsWithCodes() @@ -98,8 +95,6 @@ class Warnings(object): "you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. " "If this is surprising, make sure you have the spacy-lookups-data " "package installed.") - W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. " - "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " @@ -550,6 +545,7 @@ class Errors(object): E999 = ("Encountered an unexpected format for the dictionary holding " "gold annotations: {gold_dict}") + @add_codes class TempErrors(object): T003 = ("Resizing pretrained Tagger models is not currently supported.") @@ -573,10 +569,10 @@ class MatchPatternError(ValueError): errors (dict): Validation errors (sequence of strings) mapped to pattern ID, i.e. the index of the added pattern. """ - msg = "Invalid token patterns for matcher rule '{}'\n".format(key) + msg = f"Invalid token patterns for matcher rule '{key}'\n" for pattern_idx, error_msgs in errors.items(): - pattern_errors = "\n".join(["- {}".format(e) for e in error_msgs]) - msg += "\nPattern {}:\n{}\n".format(pattern_idx, pattern_errors) + pattern_errors = "\n".join([f"- {e}" for e in error_msgs]) + msg += f"\nPattern {pattern_idx}:\n{pattern_errors}\n" ValueError.__init__(self, msg) diff --git a/spacy/glossary.py b/spacy/glossary.py index 44a8277da..5e7e531a9 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - def explain(term): """Get a description for a given POS tag, dependency label or entity type. diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 0374825dc..e3af40d4d 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,7 +1,4 @@ # cython: profile=True -# coding: utf8 -from __future__ import unicode_literals, print_function - import re import random import numpy @@ -14,7 +11,6 @@ import srsly from .syntax import nonproj from .tokens import Doc, Span from .errors import Errors, AlignmentError, user_warning, Warnings -from .compat import path2str, basestring_ from . 
import util @@ -157,7 +153,7 @@ class GoldCorpus(object): self.write_msgpack(self.tmp_dir / "dev", dev, limit=self.limit) def __del__(self): - shutil.rmtree(path2str(self.tmp_dir)) + shutil.rmtree(self.tmp_dir) @staticmethod def write_msgpack(directory, examples, limit=0): @@ -167,7 +163,7 @@ class GoldCorpus(object): for i, example in enumerate(examples): ex_dict = example.to_dict() text = example.text - srsly.write_msgpack(directory / "{}.msg".format(i), (text, ex_dict)) + srsly.write_msgpack(directory / f"{i}.msg", (text, ex_dict)) n += 1 if limit and n >= limit: break @@ -221,7 +217,7 @@ class GoldCorpus(object): examples = [Example.from_dict(ex_dict, doc=text)] else: supported = ("json", "jsonl", "msg") - raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported)) + raise ValueError(Errors.E124.format(path=loc, formats=supported)) for example in examples: yield example i += 1 @@ -862,7 +858,7 @@ cdef class Example: converted_examples = [] for ex in examples: # convert string to Doc to Example - if isinstance(ex, basestring_): + if isinstance(ex, str): if keep_raw_text: converted_examples.append(Example(doc=ex)) else: @@ -876,7 +872,7 @@ cdef class Example: doc, gold = ex gold_dict = {} # convert string to Doc - if isinstance(doc, basestring_) and not keep_raw_text: + if isinstance(doc, str) and not keep_raw_text: doc = make_doc(doc) # convert dict to GoldParse if isinstance(gold, dict): @@ -988,7 +984,7 @@ cdef class GoldParse: # Translate the None values to '-', to make processing easier. # See Issue #2603 entities = [(ent if ent is not None else "-") for ent in entities] - if not isinstance(entities[0], basestring_): + if not isinstance(entities[0], str): # Assume we have entities specified by character offset. entities = biluo_tags_from_offsets(doc, entities) @@ -1107,7 +1103,7 @@ cdef class GoldParse: cycle = nonproj.contains_cycle(self.heads) if cycle is not None: raise ValueError(Errors.E069.format(cycle=cycle, - cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), + cycle_tokens=" ".join([f"'{self.words[tok_id]}'" for tok_id in cycle]), doc_tokens=" ".join(words[:50]))) def __len__(self): diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 63eb41b42..1129fa860 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,22 +1,17 @@ # cython: infer_types=True # cython: profile=True -# coding: utf8 -from spacy.errors import Errors, Warnings, user_warning - from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap - from cpython.exc cimport PyErr_SetFromErrno - from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t - -from .typedefs cimport hash_t - from os import path from libcpp.vector cimport vector +from .typedefs cimport hash_t +from .errors import Errors, Warnings, user_warning + cdef class Candidate: """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved @@ -584,5 +579,3 @@ cdef class Reader: cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status - - diff --git a/spacy/lang/af/__init__.py b/spacy/lang/af/__init__.py index 90ea324f0..0da123419 100644 --- a/spacy/lang/af/__init__.py +++ b/spacy/lang/af/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/af/stop_words.py b/spacy/lang/af/stop_words.py index 
2b3bcc019..dfd144de9 100644 --- a/spacy/lang/af/stop_words.py +++ b/spacy/lang/af/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-af diff --git a/spacy/lang/ar/__init__.py b/spacy/lang/ar/__init__.py index c120703f6..6a1a8af3a 100644 --- a/spacy/lang/ar/__init__.py +++ b/spacy/lang/ar/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ar/examples.py b/spacy/lang/ar/examples.py index 2a10f4fcc..a51bb9ded 100644 --- a/spacy/lang/ar/examples.py +++ b/spacy/lang/ar/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ar/lex_attrs.py b/spacy/lang/ar/lex_attrs.py index 19e7aef8a..54ad7a8c3 100644 --- a/spacy/lang/ar/lex_attrs.py +++ b/spacy/lang/ar/lex_attrs.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...attrs import LIKE_NUM _num_words = set( diff --git a/spacy/lang/ar/punctuation.py b/spacy/lang/ar/punctuation.py index 6625c5475..f30204c02 100644 --- a/spacy/lang/ar/punctuation.py +++ b/spacy/lang/ar/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER diff --git a/spacy/lang/ar/stop_words.py b/spacy/lang/ar/stop_words.py index de2fc7443..f4da54dda 100644 --- a/spacy/lang/ar/stop_words.py +++ b/spacy/lang/ar/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ من diff --git a/spacy/lang/ar/tokenizer_exceptions.py b/spacy/lang/ar/tokenizer_exceptions.py index 030daecd5..a11f3b43a 100644 --- a/spacy/lang/ar/tokenizer_exceptions.py +++ b/spacy/lang/ar/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/bg/__init__.py b/spacy/lang/bg/__init__.py index 9b4c647e3..437feb9ed 100644 --- a/spacy/lang/bg/__init__.py +++ b/spacy/lang/bg/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/bg/examples.py b/spacy/lang/bg/examples.py index b08b8926d..a6d40da1a 100644 --- a/spacy/lang/bg/examples.py +++ b/spacy/lang/bg/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
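
The train.py, validate.py, and errors.py hunks above all apply the same mechanical conversion: multi-line `.format()` calls become f-strings, which only becomes possible once Python 3.6 is the floor. A minimal sketch of the pattern; `textcat_labels` and `train_labels` here are illustrative stand-ins, not the real CLI objects:

```python
textcat_labels = ["POSITIVE", "NEGATIVE"]   # stand-in values
train_labels = {"POSITIVE", "NEUTRAL"}

# Before: Python 2 compatible str.format() across implicit concatenation
old = (
    "Cannot extend textcat model using data with different "
    "labels. Base model labels: {}, training data labels: "
    "{}.".format(textcat_labels, list(train_labels))
)

# After: f-strings, as in the hunks above. Each fragment gets an f-prefix
# even when it contains no placeholder; that is harmless.
new = (
    f"Cannot extend textcat model using data with different "
    f"labels. Base model labels: {textcat_labels}, training data "
    f"labels: {list(train_labels)}"
)
print(new)
```
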
diff --git a/spacy/lang/bg/stop_words.py b/spacy/lang/bg/stop_words.py index e7c65cbc2..45a252bc9 100644 --- a/spacy/lang/bg/stop_words.py +++ b/spacy/lang/bg/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Alir3z4/stop-words diff --git a/spacy/lang/bn/__init__.py b/spacy/lang/bn/__init__.py index e70232552..901676554 100644 --- a/spacy/lang/bn/__init__.py +++ b/spacy/lang/bn/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/bn/examples.py b/spacy/lang/bn/examples.py index 2d5bdb238..051e59d84 100644 --- a/spacy/lang/bn/examples.py +++ b/spacy/lang/bn/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/bn/morph_rules.py b/spacy/lang/bn/morph_rules.py index 21a76c7e6..44d6108e9 100644 --- a/spacy/lang/bn/morph_rules.py +++ b/spacy/lang/bn/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/bn/punctuation.py b/spacy/lang/bn/punctuation.py index f624b4ba4..becfe8d2a 100644 --- a/spacy/lang/bn/punctuation.py +++ b/spacy/lang/bn/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS from ..char_classes import ALPHA_LOWER, ALPHA, HYPHENS, CONCAT_QUOTES, UNITS diff --git a/spacy/lang/bn/stop_words.py b/spacy/lang/bn/stop_words.py index 6c9967df8..6bcd06b37 100644 --- a/spacy/lang/bn/stop_words.py +++ b/spacy/lang/bn/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/bn/tag_map.py b/spacy/lang/bn/tag_map.py index 1efb35858..36d69ccf9 100644 --- a/spacy/lang/bn/tag_map.py +++ b/spacy/lang/bn/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import CCONJ, NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SYM diff --git a/spacy/lang/bn/tokenizer_exceptions.py b/spacy/lang/bn/tokenizer_exceptions.py index 32acb1730..18e313a25 100644 --- a/spacy/lang/bn/tokenizer_exceptions.py +++ b/spacy/lang/bn/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding=utf-8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 6d4c00a6b..a1ff2f2df 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/ca/examples.py b/spacy/lang/ca/examples.py index 3020ee707..3fbf1fb0a 100644 --- a/spacy/lang/ca/examples.py +++ b/spacy/lang/ca/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
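
Several hunks above (for example `_collate_best_model` in train.py and the symlink helpers in compat.py) replace `path2str(p)` with plain `str(p)`. The helper only existed because stringifying a `Path` needed a `.decode("utf8")` on Python 2. A self-contained sketch of the Python 3 equivalence; the directory names are hypothetical:

```python
import shutil
import tempfile
from pathlib import Path

output_path = Path(tempfile.mkdtemp())   # hypothetical work dir for the demo
(output_path / "model-final").mkdir()

# Python 2: shutil.copytree(path2str(src), path2str(dest))
# Python 3: str() round-trips cleanly, and most stdlib functions also
# accept Path objects directly (os.PathLike, since 3.6).
src = output_path / "model-final"
dest = output_path / "model-best"
shutil.copytree(str(src), str(dest))     # as written in the diff
shutil.rmtree(output_path)               # clean up the sketch
```
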
diff --git a/spacy/lang/ca/lex_attrs.py b/spacy/lang/ca/lex_attrs.py index 6314efa92..be8b7a6ea 100644 --- a/spacy/lang/ca/lex_attrs.py +++ b/spacy/lang/ca/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py index 4439376c8..d50b75589 100644 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import ALPHA diff --git a/spacy/lang/ca/stop_words.py b/spacy/lang/ca/stop_words.py index a803db2a5..1a87b2f9d 100644 --- a/spacy/lang/ca/stop_words.py +++ b/spacy/lang/ca/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ a abans ací ah així això al aleshores algun alguna algunes alguns alhora allà allí allò diff --git a/spacy/lang/ca/tag_map.py b/spacy/lang/ca/tag_map.py index 472e772ef..1ecbddc49 100644 --- a/spacy/lang/ca/tag_map.py +++ b/spacy/lang/ca/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index d95e5e626..5a9d9055a 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index 2c8823867..73f48e49a 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - split_chars = lambda char: list(char.strip().split(" ")) merge_chars = lambda char: char.strip().replace(" ", "|") group_chars = lambda char: char.strip().replace(" ", "") diff --git a/spacy/lang/cs/__init__.py b/spacy/lang/cs/__init__.py index 5b1397ba2..a27e3339d 100644 --- a/spacy/lang/cs/__init__.py +++ b/spacy/lang/cs/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/cs/stop_words.py b/spacy/lang/cs/stop_words.py index 59d3c102e..e8171a7e5 100644 --- a/spacy/lang/cs/stop_words.py +++ b/spacy/lang/cs/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Alir3z4/stop-words diff --git a/spacy/lang/da/__init__.py b/spacy/lang/da/__init__.py index ac8c04954..2828c014b 100644 --- a/spacy/lang/da/__init__.py +++ b/spacy/lang/da/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES diff --git a/spacy/lang/da/examples.py b/spacy/lang/da/examples.py index b535191a1..e5c6448f0 100644 --- a/spacy/lang/da/examples.py +++ b/spacy/lang/da/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
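
The `_configure_training_output` hunk above drops `OrderedDict` in favor of a plain dict comprehension. That relies on dicts preserving insertion order, an implementation detail in CPython 3.6 and a language guarantee from 3.7, which is why the simplification is safe once 2.7 and 3.5 support are gone. A sketch of the dedup idiom with made-up column names:

```python
row_head = ["Itn", "Dep Loss", "Beam W.", "Dep Loss", "Itn"]  # made-up names

# Remove duplicates while preserving first-seen order; the values are dummies.
row_head_dict = {k: 1 for k in row_head}
assert list(row_head_dict.keys()) == ["Itn", "Dep Loss", "Beam W."]
```
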
diff --git a/spacy/lang/da/lex_attrs.py b/spacy/lang/da/lex_attrs.py index 9fefc1eba..403af686c 100644 --- a/spacy/lang/da/lex_attrs.py +++ b/spacy/lang/da/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/da/morph_rules.py b/spacy/lang/da/morph_rules.py index 7ffe2ac6f..06704f482 100644 --- a/spacy/lang/da/morph_rules.py +++ b/spacy/lang/da/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # Source: Danish Universal Dependencies and http://fjern-uv.dk/pronom.php diff --git a/spacy/lang/da/norm_exceptions.py b/spacy/lang/da/norm_exceptions.py index dbffdb88b..c689500f4 100644 --- a/spacy/lang/da/norm_exceptions.py +++ b/spacy/lang/da/norm_exceptions.py @@ -1,10 +1,7 @@ -# coding: utf8 """ Special-case rules for normalizing tokens to improve the model's predictions. For example 'mysterium' vs 'mysterie' and similar. """ -from __future__ import unicode_literals - # Sources: # 1: https://dsn.dk/retskrivning/om-retskrivningsordbogen/mere-om-retskrivningsordbogen-2012/endrede-stave-og-ordformer/ diff --git a/spacy/lang/da/punctuation.py b/spacy/lang/da/punctuation.py index b6b852c55..e050ab7aa 100644 --- a/spacy/lang/da/punctuation.py +++ b/spacy/lang/da/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/da/stop_words.py b/spacy/lang/da/stop_words.py index 48de0c7ca..05b2084dd 100644 --- a/spacy/lang/da/stop_words.py +++ b/spacy/lang/da/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Source: Handpicked by Jens Dahl Møllerhøj. STOP_WORDS = set( diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index d669fb981..64eba819f 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -1,11 +1,7 @@ -# encoding: utf8 """ Tokenizer Exceptions. Source: https://forkortelse.dk/ and various others. """ - -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT diff --git a/spacy/lang/de/__init__.py b/spacy/lang/de/__init__.py index 1412f033a..8478b6f23 100644 --- a/spacy/lang/de/__init__.py +++ b/spacy/lang/de/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES diff --git a/spacy/lang/de/examples.py b/spacy/lang/de/examples.py index 0c64a693a..530ece629 100644 --- a/spacy/lang/de/examples.py +++ b/spacy/lang/de/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/de/norm_exceptions.py b/spacy/lang/de/norm_exceptions.py index 3dbd4c7e3..6ad5b62a7 100644 --- a/spacy/lang/de/norm_exceptions.py +++ b/spacy/lang/de/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Here we only want to include the absolute most common words. Otherwise, # this list would get impossibly long for German – especially considering the # old vs. new spelling rules, and all possible cases. 
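
The compat.py hunk earlier narrows `is_config()` to operating-system checks only; the `python2`/`python3` flags are removed and `**kwargs` silently absorbs them, presumably so stale call sites don't raise a `TypeError`. A self-contained sketch of the new behavior; the names mirror `spacy.compat` but this is a re-implementation for illustration, not the installed module:

```python
import sys

is_windows = sys.platform.startswith("win")
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"

def is_config(windows=None, linux=None, osx=None, **kwargs):
    # Old python2=/python3= keywords land in **kwargs and are ignored.
    return (
        windows in (None, is_windows)
        and linux in (None, is_linux)
        and osx in (None, is_osx)
    )

print(is_config(python2=True))  # no TypeError; the stale flag is ignored
```
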
diff --git a/spacy/lang/de/punctuation.py b/spacy/lang/de/punctuation.py index 7dfa61bd4..72f7e1022 100644 --- a/spacy/lang/de/punctuation.py +++ b/spacy/lang/de/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/de/stop_words.py b/spacy/lang/de/stop_words.py index cf3204d5e..df708e22e 100644 --- a/spacy/lang/de/stop_words.py +++ b/spacy/lang/de/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/de/syntax_iterators.py b/spacy/lang/de/syntax_iterators.py index 89d784a0c..410d2f0b4 100644 --- a/spacy/lang/de/syntax_iterators.py +++ b/spacy/lang/de/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index c169501a9..ca7ec61f1 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB diff --git a/spacy/lang/de/tokenizer_exceptions.py b/spacy/lang/de/tokenizer_exceptions.py index 5b09a0b89..3dd8507bc 100644 --- a/spacy/lang/de/tokenizer_exceptions.py +++ b/spacy/lang/de/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA diff --git a/spacy/lang/el/__init__.py b/spacy/lang/el/__init__.py index 16863e6d7..1ef7c503f 100644 --- a/spacy/lang/el/__init__.py +++ b/spacy/lang/el/__init__.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map_general import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/el/examples.py b/spacy/lang/el/examples.py index 521e7b30d..62515c07a 100644 --- a/spacy/lang/el/examples.py +++ b/spacy/lang/el/examples.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.el.examples import sentences diff --git a/spacy/lang/el/get_pos_from_wiktionary.py b/spacy/lang/el/get_pos_from_wiktionary.py index f41833974..01deb23a2 100644 --- a/spacy/lang/el/get_pos_from_wiktionary.py +++ b/spacy/lang/el/get_pos_from_wiktionary.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - def get_pos_from_wiktionary(): import re diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py index 6f5b3999b..cf3a7fe97 100644 --- a/spacy/lang/el/lemmatizer.py +++ b/spacy/lang/el/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer diff --git a/spacy/lang/el/lex_attrs.py b/spacy/lang/el/lex_attrs.py index cf32fe12c..5c8f96848 100644 --- a/spacy/lang/el/lex_attrs.py +++ b/spacy/lang/el/lex_attrs.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/el/norm_exceptions.py b/spacy/lang/el/norm_exceptions.py index d4384ff3c..d540aae2c 100644 --- a/spacy/lang/el/norm_exceptions.py +++ b/spacy/lang/el/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # These exceptions are used to add NORM values based on a token's ORTH value. # Norms are only set if no alternative is provided in the tokenizer exceptions. diff --git a/spacy/lang/el/punctuation.py b/spacy/lang/el/punctuation.py index fbf773f4d..2d5690407 100644 --- a/spacy/lang/el/punctuation.py +++ b/spacy/lang/el/punctuation.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from ..char_classes import LIST_ICONS, ALPHA_LOWER, ALPHA_UPPER, ALPHA, HYPHENS from ..char_classes import CONCAT_QUOTES, CURRENCY diff --git a/spacy/lang/el/stop_words.py b/spacy/lang/el/stop_words.py index f13c47ec2..8484826d1 100644 --- a/spacy/lang/el/stop_words.py +++ b/spacy/lang/el/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words # Link to greek stop words: https://www.translatum.gr/forum/index.php?topic=3550.0?topic=3550.0 diff --git a/spacy/lang/el/syntax_iterators.py b/spacy/lang/el/syntax_iterators.py index 5dfd44f07..988a36c80 100644 --- a/spacy/lang/el/syntax_iterators.py +++ b/spacy/lang/el/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/el/tag_map.py b/spacy/lang/el/tag_map.py index b346299bc..adfacd025 100644 --- a/spacy/lang/el/tag_map.py +++ b/spacy/lang/el/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, PRON, AUX diff --git a/spacy/lang/el/tag_map_general.py b/spacy/lang/el/tag_map_general.py index 42e64a013..d7e89d43a 100644 --- a/spacy/lang/el/tag_map_general.py +++ b/spacy/lang/el/tag_map_general.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ...symbols import PUNCT, NUM, AUX, X, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/el/tokenizer_exceptions.py b/spacy/lang/el/tokenizer_exceptions.py index a3c36542e..27ae1fe3a 100644 --- a/spacy/lang/el/tokenizer_exceptions.py +++ 
b/spacy/lang/el/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/en/__init__.py b/spacy/lang/en/__init__.py index fca4e01e7..fa01e2b60 100644 --- a/spacy/lang/en/__init__.py +++ b/spacy/lang/en/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .tag_map import TAG_MAP diff --git a/spacy/lang/en/examples.py b/spacy/lang/en/examples.py index 946289c7c..0363a45e7 100644 --- a/spacy/lang/en/examples.py +++ b/spacy/lang/en/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/en/lex_attrs.py b/spacy/lang/en/lex_attrs.py index f92d41139..96fb4c9fa 100644 --- a/spacy/lang/en/lex_attrs.py +++ b/spacy/lang/en/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/en/morph_rules.py b/spacy/lang/en/morph_rules.py index 5ed4eac59..aa3e6ce57 100644 --- a/spacy/lang/en/morph_rules.py +++ b/spacy/lang/en/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # Several entries here look pretty suspicious. These will get the POS SCONJ diff --git a/spacy/lang/en/norm_exceptions.py b/spacy/lang/en/norm_exceptions.py index a2cf58b8a..431d9c049 100644 --- a/spacy/lang/en/norm_exceptions.py +++ b/spacy/lang/en/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Slang and abbreviations diff --git a/spacy/lang/en/stop_words.py b/spacy/lang/en/stop_words.py index 3505b13bf..4573c9411 100644 --- a/spacy/lang/en/stop_words.py +++ b/spacy/lang/en/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words STOP_WORDS = set( diff --git a/spacy/lang/en/syntax_iterators.py b/spacy/lang/en/syntax_iterators.py index ed665ef29..86695cf6f 100644 --- a/spacy/lang/en/syntax_iterators.py +++ b/spacy/lang/en/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index ecb3103cc..2078798f7 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON diff --git a/spacy/lang/en/tokenizer_exceptions.py b/spacy/lang/en/tokenizer_exceptions.py index c45197771..776948c28 100644 --- a/spacy/lang/en/tokenizer_exceptions.py +++ b/spacy/lang/en/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA diff --git a/spacy/lang/es/__init__.py b/spacy/lang/es/__init__.py index 80cc1727c..060bd8fc6 100644 --- a/spacy/lang/es/__init__.py +++ b/spacy/lang/es/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py index 0e31b56af..1c1ad631b 100644 --- 
a/spacy/lang/es/examples.py +++ b/spacy/lang/es/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/es/lex_attrs.py b/spacy/lang/es/lex_attrs.py index 03ada1f43..d2a3c891a 100644 --- a/spacy/lang/es/lex_attrs.py +++ b/spacy/lang/es/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/es/stop_words.py b/spacy/lang/es/stop_words.py index 20e929b48..3d46a88cb 100644 --- a/spacy/lang/es/stop_words.py +++ b/spacy/lang/es/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/es/syntax_iterators.py b/spacy/lang/es/syntax_iterators.py index 6a78d86f7..e998cd1d6 100644 --- a/spacy/lang/es/syntax_iterators.py +++ b/spacy/lang/es/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON, VERB, AUX diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 7a7c9d549..1748162c0 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ diff --git a/spacy/lang/es/tokenizer_exceptions.py b/spacy/lang/es/tokenizer_exceptions.py index 9109d658b..1cd5941be 100644 --- a/spacy/lang/es/tokenizer_exceptions.py +++ b/spacy/lang/es/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA diff --git a/spacy/lang/et/__init__.py b/spacy/lang/et/__init__.py index d84c081ef..e0b0a8a87 100644 --- a/spacy/lang/et/__init__.py +++ b/spacy/lang/et/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/et/stop_words.py b/spacy/lang/et/stop_words.py index 15070db5f..3b600a158 100644 --- a/spacy/lang/et/stop_words.py +++ b/spacy/lang/et/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-et diff --git a/spacy/lang/fa/__init__.py b/spacy/lang/fa/__init__.py index 9d85f814a..aa02855e9 100644 --- a/spacy/lang/fa/__init__.py +++ b/spacy/lang/fa/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...language import Language from ...attrs import LANG, NORM from ...util import update_exc, add_lookups diff --git a/spacy/lang/fa/examples.py b/spacy/lang/fa/examples.py index 3f65a366d..d89feb6c8 100644 --- a/spacy/lang/fa/examples.py +++ b/spacy/lang/fa/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
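
The displacy hunk earlier in this diff drops `b_to_str()` from the WSGI `app()`: under PEP 3333, the status line and header names/values are native `str` on Python 3, while the response body must still be bytes. (The "need to be bytes in Python 2" comment left as context above that hunk is now historical.) A minimal standalone sketch:

```python
from wsgiref.simple_server import make_server  # used only by the demo line

def app(environ, start_response):
    # Status and headers are plain str on Python 3 (PEP 3333)...
    headers = [("Content-type", "text/html; charset=utf-8")]
    start_response("200 OK", headers)
    # ...but the body is still an iterable of bytes.
    return ["<b>Parsed output would go here.</b>".encode("utf-8")]

# make_server("0.0.0.0", 5000, app).serve_forever()  # uncomment to run
```
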
diff --git a/spacy/lang/fa/generate_verbs_exc.py b/spacy/lang/fa/generate_verbs_exc.py index 5d0ff944d..61586dc3f 100644 --- a/spacy/lang/fa/generate_verbs_exc.py +++ b/spacy/lang/fa/generate_verbs_exc.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - verb_roots = """ #هست diff --git a/spacy/lang/fa/lex_attrs.py b/spacy/lang/fa/lex_attrs.py index dbea66b68..99b8e2787 100644 --- a/spacy/lang/fa/lex_attrs.py +++ b/spacy/lang/fa/lex_attrs.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...attrs import LIKE_NUM diff --git a/spacy/lang/fa/punctuation.py b/spacy/lang/fa/punctuation.py index 33aa46ae2..4b258c13d 100644 --- a/spacy/lang/fa/punctuation.py +++ b/spacy/lang/fa/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import UNITS, ALPHA_UPPER diff --git a/spacy/lang/fa/stop_words.py b/spacy/lang/fa/stop_words.py index 682fb7a71..372422b67 100644 --- a/spacy/lang/fa/stop_words.py +++ b/spacy/lang/fa/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words from HAZM package STOP_WORDS = set( diff --git a/spacy/lang/fa/syntax_iterators.py b/spacy/lang/fa/syntax_iterators.py index ed665ef29..86695cf6f 100644 --- a/spacy/lang/fa/syntax_iterators.py +++ b/spacy/lang/fa/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/fa/tag_map.py b/spacy/lang/fa/tag_map.py index b9043adf0..f1f106915 100644 --- a/spacy/lang/fa/tag_map.py +++ b/spacy/lang/fa/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import PRON, NOUN, PART, INTJ, AUX diff --git a/spacy/lang/fa/tokenizer_exceptions.py b/spacy/lang/fa/tokenizer_exceptions.py index b3f8dcbf5..db9e3f6fc 100644 --- a/spacy/lang/fa/tokenizer_exceptions.py +++ b/spacy/lang/fa/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, TAG, NORM diff --git a/spacy/lang/fi/__init__.py b/spacy/lang/fi/__init__.py index 45d2f886f..db58ad3ba 100644 --- a/spacy/lang/fi/__init__.py +++ b/spacy/lang/fi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/fi/examples.py b/spacy/lang/fi/examples.py index 88be248a6..930fac273 100644 --- a/spacy/lang/fi/examples.py +++ b/spacy/lang/fi/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.fi.examples import sentences diff --git a/spacy/lang/fi/lex_attrs.py b/spacy/lang/fi/lex_attrs.py index e960b55eb..4d500cead 100644 --- a/spacy/lang/fi/lex_attrs.py +++ b/spacy/lang/fi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/fi/punctuation.py b/spacy/lang/fi/punctuation.py index 02eb1b200..878c8e250 100644 --- a/spacy/lang/fi/punctuation.py +++ b/spacy/lang/fi/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/fi/stop_words.py b/spacy/lang/fi/stop_words.py index e8e39ec6f..642cfc369 100644 --- a/spacy/lang/fi/stop_words.py +++ b/spacy/lang/fi/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source https://github.com/stopwords-iso/stopwords-fi/blob/master/stopwords-fi.txt # Reformatted with some minor corrections diff --git a/spacy/lang/fi/tokenizer_exceptions.py b/spacy/lang/fi/tokenizer_exceptions.py index d74deb22b..44360e969 100644 --- a/spacy/lang/fi/tokenizer_exceptions.py +++ b/spacy/lang/fi/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/fr/__init__.py b/spacy/lang/fr/__init__.py index f56c8688a..dc45e538c 100644 --- a/spacy/lang/fr/__init__.py +++ b/spacy/lang/fr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/fr/_tokenizer_exceptions_list.py b/spacy/lang/fr/_tokenizer_exceptions_list.py index c9fcfff2d..7f908dac8 100644 --- a/spacy/lang/fr/_tokenizer_exceptions_list.py +++ b/spacy/lang/fr/_tokenizer_exceptions_list.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - FR_BASE_EXCEPTIONS = [ "(+)-amphétamine", "(5R,6S)-7,8-didehydro-4,5-époxy-3-méthoxy-N-méthylmorphinan-6-ol", diff --git a/spacy/lang/fr/examples.py b/spacy/lang/fr/examples.py index a874c22fc..57d57f4a6 100644 --- a/spacy/lang/fr/examples.py +++ b/spacy/lang/fr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
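
The bulk of the remaining hunks, including every `spacy/lang/*` module above and below, remove the same two-line preamble. On Python 3, source files are UTF-8 by default (PEP 3120) and every string literal is unicode, so the modules now begin directly with their imports or data. A sketch of the before and after, with stand-in stop-word data:

```python
# Before (at the top of every module):
#     # coding: utf8
#     from __future__ import unicode_literals
#
# After: no preamble needed. Non-ASCII literals just work:
STOP_WORDS = set("að af aldrei".split())   # stand-in data, not from the diff
assert all(isinstance(w, str) for w in STOP_WORDS)
```
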
diff --git a/spacy/lang/fr/lemmatizer.py b/spacy/lang/fr/lemmatizer.py index 79f4dd28d..84e55d509 100644 --- a/spacy/lang/fr/lemmatizer.py +++ b/spacy/lang/fr/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...symbols import POS, NOUN, VERB, ADJ, ADV, PRON, DET, AUX, PUNCT, ADP from ...symbols import SCONJ, CCONJ diff --git a/spacy/lang/fr/lex_attrs.py b/spacy/lang/fr/lex_attrs.py index e3ccd9fdd..da98c6e37 100644 --- a/spacy/lang/fr/lex_attrs.py +++ b/spacy/lang/fr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/fr/punctuation.py b/spacy/lang/fr/punctuation.py index 1422b4194..5f42e7f25 100644 --- a/spacy/lang/fr/punctuation.py +++ b/spacy/lang/fr/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CURRENCY from ..char_classes import CONCAT_QUOTES, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/fr/stop_words.py b/spacy/lang/fr/stop_words.py index ae8432043..9c12e49a3 100644 --- a/spacy/lang/fr/stop_words.py +++ b/spacy/lang/fr/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/fr/syntax_iterators.py b/spacy/lang/fr/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/fr/syntax_iterators.py +++ b/spacy/lang/fr/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/fr/tag_map.py b/spacy/lang/fr/tag_map.py index 93b43c2ec..2b1b20c52 100644 --- a/spacy/lang/fr/tag_map.py +++ b/spacy/lang/fr/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, SCONJ diff --git a/spacy/lang/fr/tokenizer_exceptions.py b/spacy/lang/fr/tokenizer_exceptions.py index 4b3b2c908..b1c0a53af 100644 --- a/spacy/lang/fr/tokenizer_exceptions.py +++ b/spacy/lang/fr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from .punctuation import ELISION, HYPHENS @@ -70,7 +67,7 @@ for verb, verb_lemma in [ ]: for orth in [verb, verb.title()]: for pronoun in ["elle", "il", "on"]: - token = "{}-t-{}".format(orth, pronoun) + token = f"{orth}-t-{pronoun}" _exc[token] = [ {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, {LEMMA: "t", ORTH: "-t"}, @@ -79,7 +76,7 @@ for verb, verb_lemma in [ for verb, verb_lemma in [("est", "être")]: for orth in [verb, verb.title()]: - token = "{}-ce".format(orth) + token = f"{orth}-ce" _exc[token] = [ {LEMMA: verb_lemma, ORTH: orth, TAG: "VERB"}, {LEMMA: "ce", ORTH: "-ce"}, diff --git a/spacy/lang/ga/__init__.py b/spacy/lang/ga/__init__.py index 42b4d0d18..cea7c0e94 100644 --- a/spacy/lang/ga/__init__.py +++ b/spacy/lang/ga/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/ga/irish_morphology_helpers.py b/spacy/lang/ga/irish_morphology_helpers.py index 2133f0d22..c8cd36835 100644 --- a/spacy/lang/ga/irish_morphology_helpers.py +++ b/spacy/lang/ga/irish_morphology_helpers.py @@ -1,6 +1,3 @@ -# 
coding: utf8 -from __future__ import unicode_literals - # fmt: off consonants = ["b", "c", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r", "s", "t", "v", "w", "x", "z"] diff --git a/spacy/lang/ga/stop_words.py b/spacy/lang/ga/stop_words.py index d8f705b59..4ef052ca5 100644 --- a/spacy/lang/ga/stop_words.py +++ b/spacy/lang/ga/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ a ach ag agus an aon ar arna as diff --git a/spacy/lang/ga/tag_map.py b/spacy/lang/ga/tag_map.py index 1d8284014..baf64c1b8 100644 --- a/spacy/lang/ga/tag_map.py +++ b/spacy/lang/ga/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # fmt: off TAG_MAP = { "ADJ__Case=Gen|Form=Len|Gender=Masc|Number=Sing": {"pos": "ADJ", "Case": "gen", "Gender": "masc", "Number": "sing", "Other": {"Form": "len"}}, diff --git a/spacy/lang/ga/tokenizer_exceptions.py b/spacy/lang/ga/tokenizer_exceptions.py index c0e53f522..0c587c67e 100644 --- a/spacy/lang/ga/tokenizer_exceptions.py +++ b/spacy/lang/ga/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, DET, ADP, CCONJ, ADV, NOUN, X, AUX from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/he/__init__.py b/spacy/lang/he/__init__.py index 411cdf107..0d324f64c 100644 --- a/spacy/lang/he/__init__.py +++ b/spacy/lang/he/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/lang/he/examples.py b/spacy/lang/he/examples.py index 34cd157ae..29075c7d4 100644 --- a/spacy/lang/he/examples.py +++ b/spacy/lang/he/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/he/stop_words.py b/spacy/lang/he/stop_words.py index a01ec4246..2745460a7 100644 --- a/spacy/lang/he/stop_words.py +++ b/spacy/lang/he/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ אני diff --git a/spacy/lang/hi/__init__.py b/spacy/lang/hi/__init__.py index b0d45ddf3..9a96de95c 100644 --- a/spacy/lang/hi/__init__.py +++ b/spacy/lang/hi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/hi/examples.py b/spacy/lang/hi/examples.py index 1dd182532..7639ff940 100644 --- a/spacy/lang/hi/examples.py +++ b/spacy/lang/hi/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
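
Among the mostly mechanical header removals, the fr/tokenizer_exceptions.py hunk just above also converts its token templates to f-strings. A sketch of the loop shape with a hypothetical verb (the real code builds `ORTH`/`LEMMA` dicts; plain lists stand in here), showing the inverted forms it generates:

```python
from pprint import pprint

_exc = {}
for verb in ["aime"]:                       # hypothetical verb, not from the diff
    for orth in [verb, verb.title()]:
        for pronoun in ["elle", "il", "on"]:
            token = f"{orth}-t-{pronoun}"   # was "{}-t-{}".format(orth, pronoun)
            _exc[token] = [orth, "-t", "-" + pronoun]

pprint(_exc)  # e.g. "aime-t-elle" -> ["aime", "-t", "-elle"]
```
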
diff --git a/spacy/lang/hi/lex_attrs.py b/spacy/lang/hi/lex_attrs.py index 12666d96a..20a8c2975 100644 --- a/spacy/lang/hi/lex_attrs.py +++ b/spacy/lang/hi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..norm_exceptions import BASE_NORMS from ...attrs import NORM, LIKE_NUM diff --git a/spacy/lang/hi/stop_words.py b/spacy/lang/hi/stop_words.py index efad18c84..142fc6f47 100644 --- a/spacy/lang/hi/stop_words.py +++ b/spacy/lang/hi/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/taranjeet/hindi-tokenizer/blob/master/stopwords.txt, https://data.mendeley.com/datasets/bsr3frvvjc/1#file-a21d5092-99d7-45d8-b044-3ae9edd391c6 diff --git a/spacy/lang/hr/__init__.py b/spacy/lang/hr/__init__.py index 539b164d7..fbc66ece0 100644 --- a/spacy/lang/hr/__init__.py +++ b/spacy/lang/hr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ..tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/lang/hr/examples.py b/spacy/lang/hr/examples.py index dc52ce4f0..b28fb63c2 100644 --- a/spacy/lang/hr/examples.py +++ b/spacy/lang/hr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/hr/stop_words.py b/spacy/lang/hr/stop_words.py index 408b802c5..dd10f792d 100644 --- a/spacy/lang/hr/stop_words.py +++ b/spacy/lang/hr/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-hr STOP_WORDS = set( """ diff --git a/spacy/lang/hu/__init__.py b/spacy/lang/hu/__init__.py index a331adc5b..df3fe4a44 100644 --- a/spacy/lang/hu/__init__.py +++ b/spacy/lang/hu/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH from .punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .stop_words import STOP_WORDS diff --git a/spacy/lang/hu/examples.py b/spacy/lang/hu/examples.py index 3267887fe..b60f752ec 100644 --- a/spacy/lang/hu/examples.py +++ b/spacy/lang/hu/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
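
Earlier in this diff, compat.py also dropped `izip = getattr(itertools, "izip", zip)`. On Python 3, `zip()` is already lazy, so the alias collapses to plain `zip`. A one-liner demonstrating the laziness the alias used to guarantee:

```python
pairs = zip(range(10**12), "abc")   # no huge list is materialized
assert next(pairs) == (0, "a")      # evaluation happens on demand
```
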
diff --git a/spacy/lang/hu/punctuation.py b/spacy/lang/hu/punctuation.py index bc043486f..1fea6d510 100644 --- a/spacy/lang/hu/punctuation.py +++ b/spacy/lang/hu/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, CONCAT_QUOTES from ..char_classes import CONCAT_ICONS, UNITS, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/hu/stop_words.py b/spacy/lang/hu/stop_words.py index c9a217dd6..024af68f4 100644 --- a/spacy/lang/hu/stop_words.py +++ b/spacy/lang/hu/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/hu/tokenizer_exceptions.py b/spacy/lang/hu/tokenizer_exceptions.py index c18a2cec2..cc5eede17 100644 --- a/spacy/lang/hu/tokenizer_exceptions.py +++ b/spacy/lang/hu/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from ..punctuation import ALPHA_LOWER, CURRENCY diff --git a/spacy/lang/id/__init__.py b/spacy/lang/id/__init__.py index ea8e355ac..89f874abe 100644 --- a/spacy/lang/id/__init__.py +++ b/spacy/lang/id/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .punctuation import TOKENIZER_SUFFIXES, TOKENIZER_PREFIXES, TOKENIZER_INFIXES from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS diff --git a/spacy/lang/id/_tokenizer_exceptions_list.py b/spacy/lang/id/_tokenizer_exceptions_list.py index fec878d5a..a0b35fa1a 100644 --- a/spacy/lang/id/_tokenizer_exceptions_list.py +++ b/spacy/lang/id/_tokenizer_exceptions_list.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - ID_BASE_EXCEPTIONS = set( """ aba-aba diff --git a/spacy/lang/id/examples.py b/spacy/lang/id/examples.py index 56ac9165e..2ce46ce5a 100644 --- a/spacy/lang/id/examples.py +++ b/spacy/lang/id/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
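
Also gone from compat.py (see the hunk earlier in this diff) is `unescape_unicode()`, which worked around a Python 2 `re` bug where character ranges between escaped codepoints unknown to the unicode database matched everything. On Python 3 the pattern compiles and behaves correctly, so the helper and its `ast.literal_eval` trick could be deleted. A quick stdlib-only check of the exact case cited in the removed docstring:

```python
import re

# On Python 2 this pattern matched every character; on Python 3 it
# correctly matches nothing in "hello".
pattern = re.compile("[\uAA77-\uAA79]")
assert pattern.findall("hello") == []
```
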
diff --git a/spacy/lang/id/lex_attrs.py b/spacy/lang/id/lex_attrs.py index 1d4584ae3..3167f4659 100644 --- a/spacy/lang/id/lex_attrs.py +++ b/spacy/lang/id/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import unicodedata from .punctuation import LIST_CURRENCY diff --git a/spacy/lang/id/norm_exceptions.py b/spacy/lang/id/norm_exceptions.py index 09ac6a6d3..63d2081e9 100644 --- a/spacy/lang/id/norm_exceptions.py +++ b/spacy/lang/id/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Daftar kosakata yang sering salah dieja # https://id.wikipedia.org/wiki/Wikipedia:Daftar_kosakata_bahasa_Indonesia_yang_sering_salah_dieja _exc = { diff --git a/spacy/lang/id/punctuation.py b/spacy/lang/id/punctuation.py index e4794d42b..f6c2387d8 100644 --- a/spacy/lang/id/punctuation.py +++ b/spacy/lang/id/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from ..char_classes import ALPHA, merge_chars, split_chars, _currency, _units diff --git a/spacy/lang/id/stop_words.py b/spacy/lang/id/stop_words.py index 0a9f91947..b1bfaea79 100644 --- a/spacy/lang/id/stop_words.py +++ b/spacy/lang/id/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ ada adalah adanya adapun agak agaknya agar akan akankah akhir akhiri akhirnya diff --git a/spacy/lang/id/syntax_iterators.py b/spacy/lang/id/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/id/syntax_iterators.py +++ b/spacy/lang/id/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/id/tag_map.py b/spacy/lang/id/tag_map.py index 16391a840..3bd08e96a 100644 --- a/spacy/lang/id/tag_map.py +++ b/spacy/lang/id/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PRON, AUX, SCONJ, INTJ, PART, PROPN diff --git a/spacy/lang/id/tokenizer_exceptions.py b/spacy/lang/id/tokenizer_exceptions.py index 86fe611bf..5259bddf8 100644 --- a/spacy/lang/id/tokenizer_exceptions.py +++ b/spacy/lang/id/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ._tokenizer_exceptions_list import ID_BASE_EXCEPTIONS from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/is/__init__.py b/spacy/lang/is/__init__.py index 18e41432d..cdcfd6e71 100644 --- a/spacy/lang/is/__init__.py +++ b/spacy/lang/is/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/is/stop_words.py b/spacy/lang/is/stop_words.py index e4ae0498b..5b3ff2f5a 100644 --- a/spacy/lang/is/stop_words.py +++ b/spacy/lang/is/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Xangis/extra-stopwords diff --git a/spacy/lang/it/__init__.py b/spacy/lang/it/__init__.py index 90763eda5..4b223582b 100644 --- a/spacy/lang/it/__init__.py +++ b/spacy/lang/it/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS 
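
One more compat.py casualty worth a note: `b_to_str()`, removed earlier in this diff, existed because calling `str()` on bytes without an encoding on Python 3 yields the repr `"b'...'"`, as its docstring warned. With only Python 3 to support, call sites decode explicitly instead. A sketch:

```python
b = b"200 OK"
assert str(b) == "b'200 OK'"         # the trap the helper guarded against
assert b.decode("utf8") == "200 OK"  # the explicit Python 3 replacement
```
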
diff --git a/spacy/lang/it/examples.py b/spacy/lang/it/examples.py index af66b7eca..30327bd14 100644 --- a/spacy/lang/it/examples.py +++ b/spacy/lang/it/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py index 4fa931fde..0b8405cc0 100644 --- a/spacy/lang/it/punctuation.py +++ b/spacy/lang/it/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_INFIXES from ..char_classes import ALPHA diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index 84233d381..5cd1af137 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/it/tag_map.py b/spacy/lang/it/tag_map.py index 798c45d80..ce0e1d9ee 100644 --- a/spacy/lang/it/tag_map.py +++ b/spacy/lang/it/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX, CONJ diff --git a/spacy/lang/it/tokenizer_exceptions.py b/spacy/lang/it/tokenizer_exceptions.py index 62f568c5c..f1cfba2c0 100644 --- a/spacy/lang/it/tokenizer_exceptions.py +++ b/spacy/lang/it/tokenizer_exceptions.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from ...symbols import ORTH, LEMMA _exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]} diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 22590043f..d1ce651d7 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - import re from collections import namedtuple diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py index e00001ed5..1d532ad77 100644 --- a/spacy/lang/ja/examples.py +++ b/spacy/lang/ja/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ja/stop_words.py b/spacy/lang/ja/stop_words.py index bb232a2d2..98560d7e2 100644 --- a/spacy/lang/ja/stop_words.py +++ b/spacy/lang/ja/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # This list was created by taking the top 2000 words from a Wikipedia dump and # filtering out everything that wasn't hiragana. ー (one) was also added. 
# Considered keeping some non-hiragana words but too many place names were diff --git a/spacy/lang/ja/tag_map.py b/spacy/lang/ja/tag_map.py index 4ff0a35ee..d922cd22b 100644 --- a/spacy/lang/ja/tag_map.py +++ b/spacy/lang/ja/tag_map.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE diff --git a/spacy/lang/kn/__init__.py b/spacy/lang/kn/__init__.py index c86354248..ef3b10f81 100644 --- a/spacy/lang/kn/__init__.py +++ b/spacy/lang/kn/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/kn/stop_words.py b/spacy/lang/kn/stop_words.py index 652341e73..cfeb0e69d 100644 --- a/spacy/lang/kn/stop_words.py +++ b/spacy/lang/kn/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index ec79a95ab..4ecdfbc58 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - from .stop_words import STOP_WORDS from .tag_map import TAG_MAP from ...attrs import LANG diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py index 7885ad801..cc0a66c0a 100644 --- a/spacy/lang/ko/examples.py +++ b/spacy/lang/ko/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/ko/lex_attrs.py b/spacy/lang/ko/lex_attrs.py index 1904a0ece..ac5bc7e48 100644 --- a/spacy/lang/ko/lex_attrs.py +++ b/spacy/lang/ko/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py index 676dca1b4..3eba9fc82 100644 --- a/spacy/lang/ko/stop_words.py +++ b/spacy/lang/ko/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ 이 diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py index 57317c969..26a8c56b9 100644 --- a/spacy/lang/ko/tag_map.py +++ b/spacy/lang/ko/tag_map.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON from ...symbols import VERB, ADV, PROPN, NUM, DET diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py index 4fcfaddb4..afcf77f33 100644 --- a/spacy/lang/lb/__init__.py +++ b/spacy/lang/lb/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES diff --git a/spacy/lang/lb/examples.py b/spacy/lang/lb/examples.py index 3cbba31d9..a7a10489c 100644 --- a/spacy/lang/lb/examples.py +++ b/spacy/lang/lb/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py index e38c74974..d2d50d9dc 100644 --- a/spacy/lang/lb/lex_attrs.py +++ b/spacy/lang/lb/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py index 7063e6863..afc384228 100644 --- a/spacy/lang/lb/norm_exceptions.py +++ b/spacy/lang/lb/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # TODO # norm exceptions: find a possibility to deal with the zillions of spelling # variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc. etc.) diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py index 1571e13d7..4886b316c 100644 --- a/spacy/lang/lb/punctuation.py +++ b/spacy/lang/lb/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS, ALPHA, ALPHA_LOWER, ALPHA_UPPER ELISION = " ' ’ ".strip().replace(" ", "") diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py index 41e6f79d2..8f22ea6e6 100644 --- a/spacy/lang/lb/stop_words.py +++ b/spacy/lang/lb/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ a diff --git a/spacy/lang/lb/tag_map.py b/spacy/lang/lb/tag_map.py index 424a83bb4..cd2e8b93c 100644 --- a/spacy/lang/lb/tag_map.py +++ b/spacy/lang/lb/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PART, SPACE, AUX diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py index b32daa58c..ebf624281 100644 --- a/spacy/lang/lb/tokenizer_exceptions.py +++ b/spacy/lang/lb/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM # TODO diff --git a/spacy/lang/lex_attrs.py b/spacy/lang/lex_attrs.py index 7c0ed8a04..339290d4a 100644 --- a/spacy/lang/lex_attrs.py +++ b/spacy/lang/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import unicodedata import re diff --git a/spacy/lang/lt/__init__.py b/spacy/lang/lt/__init__.py index 7919a4858..0f096a5b7 100644 --- a/spacy/lang/lt/__init__.py +++ b/spacy/lang/lt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/lt/examples.py b/spacy/lang/lt/examples.py index 99dbe9d4d..b2889114c 100644 --- a/spacy/lang/lt/examples.py +++ b/spacy/lang/lt/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/lt/lex_attrs.py b/spacy/lang/lt/lex_attrs.py index 81879948f..28894a59b 100644 --- a/spacy/lang/lt/lex_attrs.py +++ b/spacy/lang/lt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = { diff --git a/spacy/lang/lt/morph_rules.py b/spacy/lang/lt/morph_rules.py index 3bf26d9d8..f7bfd3cc6 100644 --- a/spacy/lang/lt/morph_rules.py +++ b/spacy/lang/lt/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/lt/stop_words.py b/spacy/lang/lt/stop_words.py index fed05d80d..8c11b3f7b 100644 --- a/spacy/lang/lt/stop_words.py +++ b/spacy/lang/lt/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = { "a", "abejais", diff --git a/spacy/lang/lt/tag_map.py b/spacy/lang/lt/tag_map.py index 6ea4f8ae0..f08db535f 100644 --- a/spacy/lang/lt/tag_map.py +++ b/spacy/lang/lt/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, ADJ, ADP, ADV, CONJ, INTJ, NOUN, NUM, PART from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X diff --git a/spacy/lang/lt/tokenizer_exceptions.py b/spacy/lang/lt/tokenizer_exceptions.py index fcf807278..e4b53e5b7 100644 --- a/spacy/lang/lt/tokenizer_exceptions.py +++ b/spacy/lang/lt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH _exc = {} diff --git a/spacy/lang/lv/__init__.py b/spacy/lang/lv/__init__.py index bb8c0763b..dd8919b73 100644 --- a/spacy/lang/lv/__init__.py +++ b/spacy/lang/lv/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/lv/stop_words.py b/spacy/lang/lv/stop_words.py index 075ad6347..a9612f949 100644 --- a/spacy/lang/lv/stop_words.py +++ b/spacy/lang/lv/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-lv diff --git a/spacy/lang/mr/__init__.py b/spacy/lang/mr/__init__.py index fd95f9354..eb52a3935 100644 --- a/spacy/lang/mr/__init__.py +++ b/spacy/lang/mr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/mr/stop_words.py b/spacy/lang/mr/stop_words.py index 0b0cd035d..0d7501461 100644 --- a/spacy/lang/mr/stop_words.py +++ b/spacy/lang/mr/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-mr/blob/master/stopwords-mr.txt, https://github.com/6/stopwords-json/edit/master/dist/mr.json STOP_WORDS = set( diff --git a/spacy/lang/nb/__init__.py b/spacy/lang/nb/__init__.py index 086761f82..3120951a2 100644 --- a/spacy/lang/nb/__init__.py +++ b/spacy/lang/nb/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .morph_rules import MORPH_RULES diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py index c15426ded..89e265951 100644 --- a/spacy/lang/nb/examples.py +++ b/spacy/lang/nb/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example 
sentences to test spaCy and its language models. diff --git a/spacy/lang/nb/morph_rules.py b/spacy/lang/nb/morph_rules.py index e20814535..b1799fca8 100644 --- a/spacy/lang/nb/morph_rules.py +++ b/spacy/lang/nb/morph_rules.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA # This dict includes all the PRON and DET tag combinations found in the diff --git a/spacy/lang/nb/punctuation.py b/spacy/lang/nb/punctuation.py index b49aa9838..5d5800ae3 100644 --- a/spacy/lang/nb/punctuation.py +++ b/spacy/lang/nb/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/nb/stop_words.py b/spacy/lang/nb/stop_words.py index caa2012e7..fd65dd788 100644 --- a/spacy/lang/nb/stop_words.py +++ b/spacy/lang/nb/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ alle allerede alt and andre annen annet at av diff --git a/spacy/lang/nb/syntax_iterators.py b/spacy/lang/nb/syntax_iterators.py index 4712d34d9..96636b0b7 100644 --- a/spacy/lang/nb/syntax_iterators.py +++ b/spacy/lang/nb/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/nb/tag_map.py b/spacy/lang/nb/tag_map.py index ca0ece265..a67586ed9 100644 --- a/spacy/lang/nb/tag_map.py +++ b/spacy/lang/nb/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, SCONJ, SYM, NUM, DET, ADV, ADP, X from ...symbols import VERB, NOUN, PROPN, PART, INTJ, PRON, AUX diff --git a/spacy/lang/nb/tokenizer_exceptions.py b/spacy/lang/nb/tokenizer_exceptions.py index 92ac09841..ef6dcf264 100644 --- a/spacy/lang/nb/tokenizer_exceptions.py +++ b/spacy/lang/nb/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/nl/__init__.py b/spacy/lang/nl/__init__.py index 074fd9133..c12b08d77 100644 --- a/spacy/lang/nl/__init__.py +++ b/spacy/lang/nl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .tag_map import TAG_MAP diff --git a/spacy/lang/nl/examples.py b/spacy/lang/nl/examples.py index a459760f4..fcefa9d62 100644 --- a/spacy/lang/nl/examples.py +++ b/spacy/lang/nl/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
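Several tokenizer_exceptions hunks above import ORTH and LEMMA to build their exception tables; the Italian hunk earlier even shows the shape inline: _exc = {"po'": [{ORTH: "po'", LEMMA: "poco"}]}. A generic sketch of that shared shape, with invented Norwegian-style entries (the abbreviations below are assumptions, not copied from the nb file):

    from spacy.symbols import ORTH, LEMMA

    _exc = {}
    for exc_data in [
        {ORTH: "jan.", LEMMA: "januar"},   # hypothetical entry
        {ORTH: "feb.", LEMMA: "februar"},  # hypothetical entry
    ]:
        # Keyed by surface form; the value lists one dict per output token.
        _exc[exc_data[ORTH]] = [exc_data]

    TOKENIZER_EXCEPTIONS = _exc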
diff --git a/spacy/lang/nl/lemmatizer.py b/spacy/lang/nl/lemmatizer.py index 9a92bee44..e7501ec52 100644 --- a/spacy/lang/nl/lemmatizer.py +++ b/spacy/lang/nl/lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...lemmatizer import Lemmatizer from ...symbols import NOUN, VERB, ADJ, NUM, DET, PRON, ADP, AUX, ADV diff --git a/spacy/lang/nl/lex_attrs.py b/spacy/lang/nl/lex_attrs.py index 69343b589..f1acaefeb 100644 --- a/spacy/lang/nl/lex_attrs.py +++ b/spacy/lang/nl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/nl/punctuation.py b/spacy/lang/nl/punctuation.py index a48ecc044..3f3be61f8 100644 --- a/spacy/lang/nl/punctuation.py +++ b/spacy/lang/nl/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, LIST_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py index 44551f2d4..a2c6198e7 100644 --- a/spacy/lang/nl/stop_words.py +++ b/spacy/lang/nl/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # The original stop words list (added in f46ffe3) was taken from # http://www.damienvanholten.com/downloads/dutch-stop-words.txt # and consisted of about 100 tokens. diff --git a/spacy/lang/nl/tag_map.py b/spacy/lang/nl/tag_map.py index 4fde5d39f..5bd7747c6 100644 --- a/spacy/lang/nl/tag_map.py +++ b/spacy/lang/nl/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, SPACE, PRON, CONJ diff --git a/spacy/lang/nl/tokenizer_exceptions.py b/spacy/lang/nl/tokenizer_exceptions.py index dbdd104f3..12ab8aef5 100644 --- a/spacy/lang/nl/tokenizer_exceptions.py +++ b/spacy/lang/nl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH # Extensive list of both common and uncommon dutch abbreviations copied from diff --git a/spacy/lang/norm_exceptions.py b/spacy/lang/norm_exceptions.py index 341967a78..c194f05c7 100644 --- a/spacy/lang/norm_exceptions.py +++ b/spacy/lang/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # These exceptions are used to add NORM values based on a token's ORTH value. 
# Individual languages can also add their own exceptions and overwrite them - diff --git a/spacy/lang/pl/__init__.py b/spacy/lang/pl/__init__.py index 702a19063..a03ead1ff 100644 --- a/spacy/lang/pl/__init__.py +++ b/spacy/lang/pl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .punctuation import TOKENIZER_INFIXES from .tag_map import TAG_MAP diff --git a/spacy/lang/pl/_tokenizer_exceptions_list.py b/spacy/lang/pl/_tokenizer_exceptions_list.py index 839eccb83..965318442 100644 --- a/spacy/lang/pl/_tokenizer_exceptions_list.py +++ b/spacy/lang/pl/_tokenizer_exceptions_list.py @@ -1,7 +1,3 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - # The following list consists of: # - exceptions generated from polish_srx_rules [1] # (https://github.com/milekpl/polish_srx_rules) diff --git a/spacy/lang/pl/examples.py b/spacy/lang/pl/examples.py index 14b6c7030..6eabe1843 100644 --- a/spacy/lang/pl/examples.py +++ b/spacy/lang/pl/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/pl/lex_attrs.py b/spacy/lang/pl/lex_attrs.py index f1379aa50..ce56e28a8 100644 --- a/spacy/lang/pl/lex_attrs.py +++ b/spacy/lang/pl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/pl/punctuation.py b/spacy/lang/pl/punctuation.py index 4e69a3912..eea28de11 100644 --- a/spacy/lang/pl/punctuation.py +++ b/spacy/lang/pl/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import LIST_ELLIPSES, CONCAT_ICONS from ..char_classes import CONCAT_QUOTES, ALPHA, ALPHA_LOWER, ALPHA_UPPER diff --git a/spacy/lang/pl/stop_words.py b/spacy/lang/pl/stop_words.py index 11df67328..075aec391 100644 --- a/spacy/lang/pl/stop_words.py +++ b/spacy/lang/pl/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 - -from __future__ import unicode_literals - # sources: https://github.com/bieli/stopwords/blob/master/polish.stopwords.txt and https://github.com/stopwords-iso/stopwords-pl STOP_WORDS = set( diff --git a/spacy/lang/pl/tag_map.py b/spacy/lang/pl/tag_map.py index 5356c26cb..b83ee4d4c 100644 --- a/spacy/lang/pl/tag_map.py +++ b/spacy/lang/pl/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ( POS, ADJ, diff --git a/spacy/lang/pl/tokenizer_exceptions.py b/spacy/lang/pl/tokenizer_exceptions.py index 9e4814b0f..39f3017ed 100644 --- a/spacy/lang/pl/tokenizer_exceptions.py +++ b/spacy/lang/pl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ._tokenizer_exceptions_list import PL_BASE_EXCEPTIONS from ...symbols import POS, ADV, NOUN, ORTH, LEMMA, ADJ diff --git a/spacy/lang/pt/__init__.py b/spacy/lang/pt/__init__.py index f786d6542..0557e8b31 100644 --- a/spacy/lang/pt/__init__.py +++ b/spacy/lang/pt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/pt/examples.py b/spacy/lang/pt/examples.py index b7206ffd7..7427f8b25 100644 --- a/spacy/lang/pt/examples.py +++ b/spacy/lang/pt/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example 
sentences to test spaCy and its language models. diff --git a/spacy/lang/pt/lex_attrs.py b/spacy/lang/pt/lex_attrs.py index 4ad0eeecb..3c6979ab4 100644 --- a/spacy/lang/pt/lex_attrs.py +++ b/spacy/lang/pt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/pt/norm_exceptions.py b/spacy/lang/pt/norm_exceptions.py index ea650cb31..e115b0385 100644 --- a/spacy/lang/pt/norm_exceptions.py +++ b/spacy/lang/pt/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # These exceptions are used to add NORM values based on a token's ORTH value. # Individual languages can also add their own exceptions and overwrite them - # for example, British vs. American spelling in English. diff --git a/spacy/lang/pt/punctuation.py b/spacy/lang/pt/punctuation.py index 370e6aaad..08e31f9d0 100644 --- a/spacy/lang/pt/punctuation.py +++ b/spacy/lang/pt/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_PREFIXES as BASE_TOKENIZER_PREFIXES from ..punctuation import TOKENIZER_SUFFIXES as BASE_TOKENIZER_SUFFIXES from ..punctuation import TOKENIZER_INFIXES as BASE_TOKENIZER_INFIXES diff --git a/spacy/lang/pt/stop_words.py b/spacy/lang/pt/stop_words.py index 774b06809..8065fcda7 100644 --- a/spacy/lang/pt/stop_words.py +++ b/spacy/lang/pt/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/pt/tag_map.py b/spacy/lang/pt/tag_map.py index cdc7de57e..dc65998a4 100644 --- a/spacy/lang/pt/tag_map.py +++ b/spacy/lang/pt/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, CCONJ from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, SCONJ, AUX diff --git a/spacy/lang/pt/tokenizer_exceptions.py b/spacy/lang/pt/tokenizer_exceptions.py index 5169780e6..2089ea8fa 100644 --- a/spacy/lang/pt/tokenizer_exceptions.py +++ b/spacy/lang/pt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, NORM diff --git a/spacy/lang/punctuation.py b/spacy/lang/punctuation.py index ccb72de28..bf7357e48 100644 --- a/spacy/lang/punctuation.py +++ b/spacy/lang/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT diff --git a/spacy/lang/ro/__init__.py b/spacy/lang/ro/__init__.py index 6c325b74d..e32ae19cb 100644 --- a/spacy/lang/ro/__init__.py +++ b/spacy/lang/ro/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/ro/examples.py b/spacy/lang/ro/examples.py index a372d7cb2..d472f0d6d 100644 --- a/spacy/lang/ro/examples.py +++ b/spacy/lang/ro/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ro/lex_attrs.py b/spacy/lang/ro/lex_attrs.py index bb8391ad1..0f86f53cd 100644 --- a/spacy/lang/ro/lex_attrs.py +++ b/spacy/lang/ro/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ro/stop_words.py b/spacy/lang/ro/stop_words.py index b5ba73458..1d90be85d 100644 --- a/spacy/lang/ro/stop_words.py +++ b/spacy/lang/ro/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-ro STOP_WORDS = set( """ diff --git a/spacy/lang/ro/tag_map.py b/spacy/lang/ro/tag_map.py index cb5239809..d6820b4f2 100644 --- a/spacy/lang/ro/tag_map.py +++ b/spacy/lang/ro/tag_map.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from ...symbols import POS, ADJ, ADP, ADV, INTJ, NOUN, NUM, PART from ...symbols import PRON, PROPN, PUNCT, SYM, VERB, X, CCONJ, SCONJ, DET, AUX diff --git a/spacy/lang/ro/tokenizer_exceptions.py b/spacy/lang/ro/tokenizer_exceptions.py index a7fb38453..8408ef987 100644 --- a/spacy/lang/ro/tokenizer_exceptions.py +++ b/spacy/lang/ro/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH diff --git a/spacy/lang/ru/__init__.py b/spacy/lang/ru/__init__.py index f34fc5435..d25e8048b 100644 --- a/spacy/lang/ru/__init__.py +++ b/spacy/lang/ru/__init__.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals, print_function - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS diff --git a/spacy/lang/ru/examples.py b/spacy/lang/ru/examples.py index 2db621dac..34cf5a1eb 100644 --- a/spacy/lang/ru/examples.py +++ b/spacy/lang/ru/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index 96d32f59c..ed0e858f5 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -1,9 +1,5 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS from ...lemmatizer import Lemmatizer -from ...compat import unicode_ class RussianLemmatizer(Lemmatizer): @@ -85,7 +81,7 @@ class RussianLemmatizer(Lemmatizer): @staticmethod def normalize_univ_pos(univ_pos): - if isinstance(univ_pos, unicode_): + if isinstance(univ_pos, str): return univ_pos.upper() symbols_to_str = { diff --git a/spacy/lang/ru/lex_attrs.py b/spacy/lang/ru/lex_attrs.py index 448c5b285..7979c7ea6 100644 --- a/spacy/lang/ru/lex_attrs.py +++ b/spacy/lang/ru/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ru/norm_exceptions.py b/spacy/lang/ru/norm_exceptions.py index 43e08948c..c5d725031 100644 --- a/spacy/lang/ru/norm_exceptions.py +++ b/spacy/lang/ru/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Slang diff --git a/spacy/lang/ru/stop_words.py b/spacy/lang/ru/stop_words.py index 89069b3cf..16cb55ef9 100644 --- a/spacy/lang/ru/stop_words.py +++ b/spacy/lang/ru/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """ а diff --git a/spacy/lang/ru/tag_map.py b/spacy/lang/ru/tag_map.py index baf065588..294919811 100644 --- a/spacy/lang/ru/tag_map.py +++ b/spacy/lang/ru/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py index ea7b5b20d..df3169baf 100644 --- a/spacy/lang/ru/tokenizer_exceptions.py +++ b/spacy/lang/ru/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/si/__init__.py b/spacy/lang/si/__init__.py index a58a63f03..3b065860c 100644 --- a/spacy/lang/si/__init__.py +++ b/spacy/lang/si/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/si/examples.py b/spacy/lang/si/examples.py index 842dfdd7e..0ff00e76e 100644 --- a/spacy/lang/si/examples.py +++ b/spacy/lang/si/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
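The RussianLemmatizer hunk above is one of the few non-mechanical changes in this stretch: the unicode_ alias from the deleted spacy.compat module becomes a plain isinstance(univ_pos, str) check, since all text is str on Python 3. A condensed sketch of the pattern (the mapping below is abridged for illustration, not copied from spacy/lang/ru/lemmatizer.py):

    from spacy.symbols import ADJ, NOUN, VERB

    def normalize_univ_pos(univ_pos):
        # The compat alias unicode_ existed only for Python 2; on Python 3
        # the builtin str does the same job in isinstance checks.
        if isinstance(univ_pos, str):
            return univ_pos.upper()
        symbols_to_str = {ADJ: "ADJ", NOUN: "NOUN", VERB: "VERB"}  # abridged
        return symbols_to_str.get(univ_pos)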
diff --git a/spacy/lang/si/lex_attrs.py b/spacy/lang/si/lex_attrs.py index 5d5f06187..aa061852d 100644 --- a/spacy/lang/si/lex_attrs.py +++ b/spacy/lang/si/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/si/stop_words.py b/spacy/lang/si/stop_words.py index 8bbdec6b7..49723c860 100644 --- a/spacy/lang/si/stop_words.py +++ b/spacy/lang/si/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/sk/__init__.py b/spacy/lang/sk/__init__.py index e7704196a..77a07e504 100644 --- a/spacy/lang/sk/__init__.py +++ b/spacy/lang/sk/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sk/stop_words.py b/spacy/lang/sk/stop_words.py index f6994d33f..bd39b22f2 100644 --- a/spacy/lang/sk/stop_words.py +++ b/spacy/lang/sk/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-sk diff --git a/spacy/lang/sl/__init__.py b/spacy/lang/sl/__init__.py index 2d4977bdf..ce46e92dc 100644 --- a/spacy/lang/sl/__init__.py +++ b/spacy/lang/sl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sl/stop_words.py b/spacy/lang/sl/stop_words.py index 187e95876..c8596ad0b 100644 --- a/spacy/lang/sl/stop_words.py +++ b/spacy/lang/sl/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords-iso/stopwords-sl # TODO: probably needs to be tidied up – the list seems to have month names in diff --git a/spacy/lang/sq/__init__.py b/spacy/lang/sq/__init__.py index 6f33b37c2..034604838 100644 --- a/spacy/lang/sq/__init__.py +++ b/spacy/lang/sq/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from ...language import Language from ...attrs import LANG diff --git a/spacy/lang/sq/examples.py b/spacy/lang/sq/examples.py index c51a0da39..e1075f70a 100644 --- a/spacy/lang/sq/examples.py +++ b/spacy/lang/sq/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/sq/stop_words.py b/spacy/lang/sq/stop_words.py index f91861ca1..58ee87d05 100644 --- a/spacy/lang/sq/stop_words.py +++ b/spacy/lang/sq/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/andrixh/index-albanian diff --git a/spacy/lang/sr/__init__.py b/spacy/lang/sr/__init__.py index f27b87102..151cc231c 100644 --- a/spacy/lang/sr/__init__.py +++ b/spacy/lang/sr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .norm_exceptions import NORM_EXCEPTIONS diff --git a/spacy/lang/sr/examples.py b/spacy/lang/sr/examples.py index d636220c3..1ac867f4c 100644 --- a/spacy/lang/sr/examples.py +++ b/spacy/lang/sr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
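The many stop_words modules in this patch all follow the pattern the hunks above truncate: a whitespace-separated block of words turned into a set. Completed generically (the words here are placeholders, not from any of these files):

    STOP_WORDS = set(
        """
    a an the of
    """.split()
    )
    assert "the" in STOP_WORDS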
diff --git a/spacy/lang/sr/lex_attrs.py b/spacy/lang/sr/lex_attrs.py index c90dc0da7..dc48909bc 100644 --- a/spacy/lang/sr/lex_attrs.py +++ b/spacy/lang/sr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/sr/norm_exceptions.py b/spacy/lang/sr/norm_exceptions.py index 69f2c3173..add8350a0 100644 --- a/spacy/lang/sr/norm_exceptions.py +++ b/spacy/lang/sr/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Slang diff --git a/spacy/lang/sr/stop_words.py b/spacy/lang/sr/stop_words.py index 9712327f8..488c82a75 100644 --- a/spacy/lang/sr/stop_words.py +++ b/spacy/lang/sr/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/sr/tokenizer_exceptions.py b/spacy/lang/sr/tokenizer_exceptions.py index 8fca346a3..82df15186 100755 --- a/spacy/lang/sr/tokenizer_exceptions.py +++ b/spacy/lang/sr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py index 671eefca0..d400eae4d 100644 --- a/spacy/lang/sv/__init__.py +++ b/spacy/lang/sv/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/sv/examples.py b/spacy/lang/sv/examples.py index 58e095195..98eee700b 100644 --- a/spacy/lang/sv/examples.py +++ b/spacy/lang/sv/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/sv/morph_rules.py b/spacy/lang/sv/morph_rules.py index 77744813f..8fca20a49 100644 --- a/spacy/lang/sv/morph_rules.py +++ b/spacy/lang/sv/morph_rules.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, PRON_LEMMA diff --git a/spacy/lang/sv/stop_words.py b/spacy/lang/sv/stop_words.py index 206abce5a..4d933a76d 100644 --- a/spacy/lang/sv/stop_words.py +++ b/spacy/lang/sv/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py index 7a82e6b59..021d5d2f5 100644 --- a/spacy/lang/sv/syntax_iterators.py +++ b/spacy/lang/sv/syntax_iterators.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import NOUN, PROPN, PRON diff --git a/spacy/lang/sv/tag_map.py b/spacy/lang/sv/tag_map.py index 7d4e29030..d4f5b6291 100644 --- a/spacy/lang/sv/tag_map.py +++ b/spacy/lang/sv/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV from ...symbols import ADP, X, VERB, NOUN, PROPN, PART, INTJ, PRON diff --git a/spacy/lang/sv/tokenizer_exceptions.py b/spacy/lang/sv/tokenizer_exceptions.py index dd0976aa6..834a088ad 100644 --- a/spacy/lang/sv/tokenizer_exceptions.py +++ b/spacy/lang/sv/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import LEMMA, NORM, ORTH, PRON_LEMMA, PUNCT, TAG _exc = {} diff --git a/spacy/lang/ta/__init__.py b/spacy/lang/ta/__init__.py index cb23339e6..d7a04afea 100644 --- a/spacy/lang/ta/__init__.py +++ b/spacy/lang/ta/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/ta/examples.py b/spacy/lang/ta/examples.py index 3ce3c3544..2590163cb 100644 --- a/spacy/lang/ta/examples.py +++ b/spacy/lang/ta/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ta/lex_attrs.py b/spacy/lang/ta/lex_attrs.py index 40158ad7a..f830f4ac9 100644 --- a/spacy/lang/ta/lex_attrs.py +++ b/spacy/lang/ta/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/ta/norm_exceptions.py b/spacy/lang/ta/norm_exceptions.py index fbdceb98c..8eaf0aa74 100644 --- a/spacy/lang/ta/norm_exceptions.py +++ b/spacy/lang/ta/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Regional words normal # Sri Lanka - wikipedia diff --git a/spacy/lang/ta/stop_words.py b/spacy/lang/ta/stop_words.py index 91ebe8fd8..83410d65e 100644 --- a/spacy/lang/ta/stop_words.py +++ b/spacy/lang/ta/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Stop words diff --git a/spacy/lang/tag_map.py b/spacy/lang/tag_map.py index 3a744f180..5bff905bd 100644 --- a/spacy/lang/tag_map.py +++ b/spacy/lang/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/te/__init__.py b/spacy/lang/te/__init__.py index a4709177d..424164cc7 100644 --- a/spacy/lang/te/__init__.py +++ b/spacy/lang/te/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/te/examples.py b/spacy/lang/te/examples.py index 815ec8227..6162b231e 100644 --- a/spacy/lang/te/examples.py +++ b/spacy/lang/te/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models.
diff --git a/spacy/lang/te/lex_attrs.py b/spacy/lang/te/lex_attrs.py index 6da766dca..ae11827f6 100644 --- a/spacy/lang/te/lex_attrs.py +++ b/spacy/lang/te/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/te/stop_words.py b/spacy/lang/te/stop_words.py index 11e157177..b18dab697 100644 --- a/spacy/lang/te/stop_words.py +++ b/spacy/lang/te/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/Xangis/extra-stopwords (MIT License) STOP_WORDS = set( diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 06970fbd7..950a77818 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .tag_map import TAG_MAP from .stop_words import STOP_WORDS diff --git a/spacy/lang/th/lex_attrs.py b/spacy/lang/th/lex_attrs.py index 047d046c2..bc4e5293e 100644 --- a/spacy/lang/th/lex_attrs.py +++ b/spacy/lang/th/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/th/norm_exceptions.py b/spacy/lang/th/norm_exceptions.py index ed1b3e760..98b878308 100644 --- a/spacy/lang/th/norm_exceptions.py +++ b/spacy/lang/th/norm_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - _exc = { # Conjugation and Diversion invalid to Tonal form (ผันอักษรและเสียงไม่ตรงกับรูปวรรณยุกต์) diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py index 119a2f6a0..7fb12d538 100644 --- a/spacy/lang/th/tag_map.py +++ b/spacy/lang/th/tag_map.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, NOUN, PRON, ADJ, ADV, INTJ, PROPN, DET, NUM, AUX, VERB from ...symbols import ADP, CCONJ, PART, PUNCT, SPACE, SCONJ diff --git a/spacy/lang/th/tokenizer_exceptions.py b/spacy/lang/th/tokenizer_exceptions.py index 4de0f1195..0529b3a99 100644 --- a/spacy/lang/th/tokenizer_exceptions.py +++ b/spacy/lang/th/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA diff --git a/spacy/lang/tl/__init__.py b/spacy/lang/tl/__init__.py index 30ad93139..f477029f7 100644 --- a/spacy/lang/tl/__init__.py +++ b/spacy/lang/tl/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/tl/lex_attrs.py b/spacy/lang/tl/lex_attrs.py index 61dc9d4f3..60bdc923b 100644 --- a/spacy/lang/tl/lex_attrs.py +++ b/spacy/lang/tl/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/tl/stop_words.py b/spacy/lang/tl/stop_words.py index 510b3a418..2560cdaed 100644 --- a/spacy/lang/tl/stop_words.py +++ b/spacy/lang/tl/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - STOP_WORDS = set( """ akin diff --git a/spacy/lang/tl/tokenizer_exceptions.py b/spacy/lang/tl/tokenizer_exceptions.py index 77e1fb0c6..ea14746c4 100644 --- a/spacy/lang/tl/tokenizer_exceptions.py +++ b/spacy/lang/tl/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import 
ORTH, LEMMA diff --git a/spacy/lang/tokenizer_exceptions.py b/spacy/lang/tokenizer_exceptions.py index 3ea2bc3e9..13a1033a6 100644 --- a/spacy/lang/tokenizer_exceptions.py +++ b/spacy/lang/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import re from ..symbols import ORTH, POS, TAG, LEMMA, SPACE diff --git a/spacy/lang/tr/__init__.py b/spacy/lang/tr/__init__.py index 2553e7c0f..a29d78261 100644 --- a/spacy/lang/tr/__init__.py +++ b/spacy/lang/tr/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS diff --git a/spacy/lang/tr/examples.py b/spacy/lang/tr/examples.py index a0464dfe3..a14d87a46 100644 --- a/spacy/lang/tr/examples.py +++ b/spacy/lang/tr/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/tr/lex_attrs.py b/spacy/lang/tr/lex_attrs.py index 93f26fc8e..3dbc1833a 100644 --- a/spacy/lang/tr/lex_attrs.py +++ b/spacy/lang/tr/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/tr/stop_words.py b/spacy/lang/tr/stop_words.py index 65905499a..85dcff6a5 100644 --- a/spacy/lang/tr/stop_words.py +++ b/spacy/lang/tr/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # Source: https://github.com/stopwords-iso/stopwords-tr STOP_WORDS = set( """ diff --git a/spacy/lang/tr/tokenizer_exceptions.py b/spacy/lang/tr/tokenizer_exceptions.py index f48e035d4..97f524a87 100644 --- a/spacy/lang/tr/tokenizer_exceptions.py +++ b/spacy/lang/tr/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, NORM _exc = {"sağol": [{ORTH: "sağ"}, {ORTH: "ol", NORM: "olun"}]} diff --git a/spacy/lang/tt/__init__.py b/spacy/lang/tt/__init__.py index 3655e6264..80574a70d 100644 --- a/spacy/lang/tt/__init__.py +++ b/spacy/lang/tt/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_INFIXES from .stop_words import STOP_WORDS diff --git a/spacy/lang/tt/examples.py b/spacy/lang/tt/examples.py index ac668a0c2..723fcdd15 100644 --- a/spacy/lang/tt/examples.py +++ b/spacy/lang/tt/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
>>> from spacy.lang.tt.examples import sentences diff --git a/spacy/lang/tt/lex_attrs.py b/spacy/lang/tt/lex_attrs.py index ad3d6b9eb..a2ae03061 100644 --- a/spacy/lang/tt/lex_attrs.py +++ b/spacy/lang/tt/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/tt/punctuation.py b/spacy/lang/tt/punctuation.py index 9ee66a59e..f644a8ccb 100644 --- a/spacy/lang/tt/punctuation.py +++ b/spacy/lang/tt/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER, CONCAT_QUOTES, HYPHENS from ..char_classes import LIST_ELLIPSES, LIST_ICONS diff --git a/spacy/lang/tt/stop_words.py b/spacy/lang/tt/stop_words.py index 9f6e9bb86..44169b757 100644 --- a/spacy/lang/tt/stop_words.py +++ b/spacy/lang/tt/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Tatar stopwords are from https://github.com/aliiae/stopwords-tt STOP_WORDS = set( diff --git a/spacy/lang/tt/tokenizer_exceptions.py b/spacy/lang/tt/tokenizer_exceptions.py index 89f7a990b..efe9e1fc0 100644 --- a/spacy/lang/tt/tokenizer_exceptions.py +++ b/spacy/lang/tt/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, NORM _exc = {} diff --git a/spacy/lang/uk/__init__.py b/spacy/lang/uk/__init__.py index e74ff2d86..51165112a 100644 --- a/spacy/lang/uk/__init__.py +++ b/spacy/lang/uk/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS diff --git a/spacy/lang/uk/examples.py b/spacy/lang/uk/examples.py index 4f2b034eb..d17768ea6 100644 --- a/spacy/lang/uk/examples.py +++ b/spacy/lang/uk/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
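The tt/lex_attrs.py hunk above opens the same _num_words / LIKE_NUM pattern that recurs across these languages: a list of spelled-out numbers plus a like_num predicate exported through LEX_ATTRS. A generic, simplified sketch (the word list and predicate body are assumptions for illustration, not copied from any one file):

    from spacy.attrs import LIKE_NUM

    _num_words = ["бер", "ике", "өч"]  # illustrative spelled-out numbers

    def like_num(text):
        # Accept plain digits, simple fractions, and spelled-out numbers.
        text = text.replace(",", "").replace(".", "")
        if text.isdigit():
            return True
        if text.count("/") == 1:
            num, denom = text.split("/")
            if num.isdigit() and denom.isdigit():
                return True
        return text.lower() in _num_words

    LEX_ATTRS = {LIKE_NUM: like_num}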
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index 3eeed5dd4..ff61d711f 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -1,4 +1,3 @@ -# coding: utf8 from ...symbols import ADJ, DET, NOUN, NUM, PRON, PROPN, PUNCT, VERB, POS from ...lemmatizer import Lemmatizer diff --git a/spacy/lang/uk/lex_attrs.py b/spacy/lang/uk/lex_attrs.py index 0ade751d6..510e5b85d 100644 --- a/spacy/lang/uk/lex_attrs.py +++ b/spacy/lang/uk/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM _num_words = [ diff --git a/spacy/lang/uk/stop_words.py b/spacy/lang/uk/stop_words.py index cdf24dd70..b11d7a044 100644 --- a/spacy/lang/uk/stop_words.py +++ b/spacy/lang/uk/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - STOP_WORDS = set( """а або diff --git a/spacy/lang/uk/tag_map.py b/spacy/lang/uk/tag_map.py index 472e772ef..1ecbddc49 100644 --- a/spacy/lang/uk/tag_map.py +++ b/spacy/lang/uk/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/uk/tokenizer_exceptions.py b/spacy/lang/uk/tokenizer_exceptions.py index a94d77af3..36f0b2e72 100644 --- a/spacy/lang/uk/tokenizer_exceptions.py +++ b/spacy/lang/uk/tokenizer_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import ORTH, LEMMA, POS, NORM, NOUN diff --git a/spacy/lang/ur/__init__.py b/spacy/lang/ur/__init__.py index 6eea0cf3b..c7f65adc3 100644 --- a/spacy/lang/ur/__init__.py +++ b/spacy/lang/ur/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from .punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ur/examples.py b/spacy/lang/ur/examples.py index f47c11600..7024483b5 100644 --- a/spacy/lang/ur/examples.py +++ b/spacy/lang/ur/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/ur/lex_attrs.py b/spacy/lang/ur/lex_attrs.py index 12d85be4b..e590ed3e3 100644 --- a/spacy/lang/ur/lex_attrs.py +++ b/spacy/lang/ur/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM # Source https://quizlet.com/4271889/1-100-urdu-number-wordsurdu-numerals-flash-cards/ diff --git a/spacy/lang/ur/punctuation.py b/spacy/lang/ur/punctuation.py index b8b1a1c83..5d35d0a25 100644 --- a/spacy/lang/ur/punctuation.py +++ b/spacy/lang/ur/punctuation.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..punctuation import TOKENIZER_SUFFIXES diff --git a/spacy/lang/ur/stop_words.py b/spacy/lang/ur/stop_words.py index 73c159d5c..abfa36497 100644 --- a/spacy/lang/ur/stop_words.py +++ b/spacy/lang/ur/stop_words.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - # Source: collected from different resource on internet STOP_WORDS = set( """ diff --git a/spacy/lang/ur/tag_map.py b/spacy/lang/ur/tag_map.py index 2499d7e3e..e0940edb7 100644 --- a/spacy/lang/ur/tag_map.py +++ b/spacy/lang/ur/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 425f84e3d..7496763ee 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LANG, NORM from ..norm_exceptions import BASE_NORMS from ...language import Language diff --git a/spacy/lang/vi/lex_attrs.py b/spacy/lang/vi/lex_attrs.py index b6cd1188a..b3dbf2192 100644 --- a/spacy/lang/vi/lex_attrs.py +++ b/spacy/lang/vi/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LIKE_NUM diff --git a/spacy/lang/vi/stop_words.py b/spacy/lang/vi/stop_words.py index 13284dc59..1d2ecdf8d 100644 --- a/spacy/lang/vi/stop_words.py +++ b/spacy/lang/vi/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # Source: https://github.com/stopwords/vietnamese-stopwords STOP_WORDS = set( """ diff --git a/spacy/lang/vi/tag_map.py b/spacy/lang/vi/tag_map.py index 472e772ef..1ecbddc49 100644 --- a/spacy/lang/vi/tag_map.py +++ b/spacy/lang/vi/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..symbols import POS, ADV, NOUN, ADP, PRON, SCONJ, PROPN, DET, SYM, INTJ from ..symbols import PUNCT, NUM, AUX, X, CONJ, ADJ, VERB, PART, SPACE, CCONJ diff --git a/spacy/lang/xx/__init__.py b/spacy/lang/xx/__init__.py index 66d8c7917..2af650703 100644 --- a/spacy/lang/xx/__init__.py +++ b/spacy/lang/xx/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..tokenizer_exceptions import BASE_EXCEPTIONS from ..norm_exceptions import BASE_NORMS diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py index 38cd5e0cd..15f5c4ff8 100644 --- a/spacy/lang/xx/examples.py +++ b/spacy/lang/xx/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/yo/__init__.py b/spacy/lang/yo/__init__.py index f227203cc..08e3166e1 100644 --- a/spacy/lang/yo/__init__.py +++ b/spacy/lang/yo/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ..tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/lang/yo/examples.py b/spacy/lang/yo/examples.py index 170ddc803..9b875d09e 100644 --- a/spacy/lang/yo/examples.py +++ b/spacy/lang/yo/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. diff --git a/spacy/lang/yo/lex_attrs.py b/spacy/lang/yo/lex_attrs.py index a9f1b85f6..ead68ced2 100644 --- a/spacy/lang/yo/lex_attrs.py +++ b/spacy/lang/yo/lex_attrs.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import unicodedata from ...attrs import LIKE_NUM diff --git a/spacy/lang/yo/stop_words.py b/spacy/lang/yo/stop_words.py index 53d382ad3..5c7a7fc45 100644 --- a/spacy/lang/yo/stop_words.py +++ b/spacy/lang/yo/stop_words.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - # stop words as whitespace-separated list. # Source: https://raw.githubusercontent.com/dohliam/more-stoplists/master/yo/yo.txt diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 8179b4551..e427dc6d2 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...attrs import LANG from ...language import Language from ...tokens import Doc diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py index b28215741..d0715eb0d 100644 --- a/spacy/lang/zh/examples.py +++ b/spacy/lang/zh/examples.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - """ Example sentences to test spaCy and its language models. 
diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py index 0b29c226e..08c8e3160 100644 --- a/spacy/lang/zh/lex_attrs.py +++ b/spacy/lang/zh/lex_attrs.py @@ -1,8 +1,8 @@ -# coding: utf8 -from __future__ import unicode_literals import re + from ...attrs import LIKE_NUM + _single_num_words = [ "〇", "一", diff --git a/spacy/lang/zh/stop_words.py b/spacy/lang/zh/stop_words.py index 0af4c1859..42ae4a1de 100644 --- a/spacy/lang/zh/stop_words.py +++ b/spacy/lang/zh/stop_words.py @@ -1,7 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - - # stop words as whitespace-separated list # Chinese stop words, maybe not enough STOP_WORDS = set( diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py index 41e2d2158..1ff0827be 100644 --- a/spacy/lang/zh/tag_map.py +++ b/spacy/lang/zh/tag_map.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE diff --git a/spacy/language.py b/spacy/language.py index 008b5559f..4a553bcaf 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,14 +1,7 @@ -# coding: utf8 -from __future__ import absolute_import, unicode_literals - import random import itertools - -from spacy.gold import Example -from spacy.util import minibatch import weakref import functools -from collections import OrderedDict from contextlib import contextmanager from copy import copy, deepcopy from thinc.neural import Model @@ -21,8 +14,7 @@ from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs -from .compat import izip, basestring_, is_python2, class_types -from .gold import GoldParse +from .gold import Example from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer from .attrs import IS_STOP, LANG @@ -32,7 +24,7 @@ from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .tokens import Doc from .lang.lex_attrs import LEX_ATTRS, is_stop -from .errors import Errors, Warnings, deprecation_warning, user_warning +from .errors import Errors, Warnings, deprecation_warning from . import util from . import about @@ -190,7 +182,7 @@ class Language(object): self._meta.setdefault("lang", self.lang) self._meta.setdefault("name", "model") self._meta.setdefault("version", "0.0.0") - self._meta.setdefault("spacy_version", ">={}".format(about.__version__)) + self._meta.setdefault("spacy_version", f">={about.__version__}") self._meta.setdefault("description", "") self._meta.setdefault("author", "") self._meta.setdefault("email", "") @@ -263,7 +255,7 @@ class Language(object): RETURNS (dict): Labels keyed by component name.
""" - labels = OrderedDict() + labels = {} for name, pipe in self.pipeline: if hasattr(pipe, "labels"): labels[name] = list(pipe.labels) @@ -320,7 +312,7 @@ class Language(object): """ if not hasattr(component, "__call__"): msg = Errors.E003.format(component=repr(component), name=name) - if isinstance(component, basestring_) and component in self.factories: + if isinstance(component, str) and component in self.factories: msg += Errors.E004.format(component=component) raise ValueError(msg) if name is None: @@ -372,7 +364,7 @@ class Language(object): raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) if not hasattr(component, "__call__"): msg = Errors.E003.format(component=repr(component), name=name) - if isinstance(component, basestring_) and component in self.factories: + if isinstance(component, str) and component in self.factories: msg += Errors.E135.format(name=name) raise ValueError(msg) self.pipeline[self.pipe_names.index(name)] = (name, component) @@ -476,6 +468,7 @@ class Language(object): sgd = self._optimizer grads = {} + def get_grads(W, dW, key=None): grads[key] = (W, dW) @@ -725,9 +718,6 @@ class Language(object): """ # raw_texts will be used later to stop iterator. texts, raw_texts = itertools.tee(texts) - if is_python2 and n_process != 1: - user_warning(Warnings.W023) - n_process = 1 if n_threads != -1: deprecation_warning(Warnings.W016) if n_process == -1: @@ -744,7 +734,7 @@ class Language(object): component_cfg=component_cfg, as_example=False ) - for doc, context in izip(docs, contexts): + for doc, context in zip(docs, contexts): yield (doc, context) return if component_cfg is None: @@ -814,7 +804,7 @@ class Language(object): *[mp.Pipe(False) for _ in range(n_process)] ) - batch_texts = minibatch(texts, batch_size) + batch_texts = util.minibatch(texts, batch_size) # Sender sends texts to the workers. # This is necessary to properly handle infinite length of texts. 
# (In this case, all data cannot be sent to the workers at once) @@ -858,7 +848,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable path = util.ensure_path(path) - serializers = OrderedDict() + serializers = {} serializers["tokenizer"] = lambda p: self.tokenizer.to_disk( p, exclude=["vocab"] ) @@ -891,7 +881,7 @@ class Language(object): deprecation_warning(Warnings.W014) exclude = disable path = util.ensure_path(path) - deserializers = OrderedDict() + deserializers = {} deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p)) deserializers["vocab"] = lambda p: self.vocab.from_disk( p @@ -925,7 +915,7 @@ class Language(object): if disable is not None: deprecation_warning(Warnings.W014) exclude = disable - serializers = OrderedDict() + serializers = {} serializers["vocab"] = lambda: self.vocab.to_bytes() serializers["tokenizer"] = lambda: self.tokenizer.to_bytes(exclude=["vocab"]) serializers["meta.json"] = lambda: srsly.json_dumps(self.meta) @@ -950,7 +940,7 @@ class Language(object): if disable is not None: deprecation_warning(Warnings.W014) exclude = disable - deserializers = OrderedDict() + deserializers = {} deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b)) deserializers["vocab"] = lambda b: self.vocab.from_bytes( b @@ -1009,7 +999,7 @@ class component(object): def factory(nlp, **cfg): if hasattr(obj, "from_nlp"): return obj.from_nlp(nlp, **cfg) - elif isinstance(obj, class_types): + elif isinstance(obj, type): return obj() return obj diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d70e4cfc4..3ba86c169 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,8 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import OrderedDict - from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN from .errors import Errors from .lookups import Lookups @@ -160,7 +155,7 @@ class Lemmatizer(object): else: oov_forms.append(form) # Remove duplicates but preserve the ordering of applied "rules" - forms = list(OrderedDict.fromkeys(forms)) + forms = list(dict.fromkeys(forms)) # Put exceptions at the front of the list, so they get priority. # This is a dodgy heuristic -- but it's the best we can do until we get # frequencies on this. We can at least prune out problematic exceptions, diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 5c981bc25..497e20516 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,7 +1,4 @@ # cython: embedsignature=True -# coding: utf8 -from __future__ import unicode_literals, print_function - # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray from libc.string cimport memset diff --git a/spacy/lookups.py b/spacy/lookups.py index bf250b4b4..a9d371b79 100644 --- a/spacy/lookups.py +++ b/spacy/lookups.py @@ -1,9 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - import srsly -from collections import OrderedDict from preshed.bloom import BloomFilter +from collections import OrderedDict from .errors import Errors from .util import SimpleFrozenDict, ensure_path @@ -28,7 +25,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#init """ - self._tables = OrderedDict() + self._tables = {} def __contains__(self, name): """Check if the lookups contain a table of a given name. 
Delegates to @@ -118,7 +115,7 @@ class Lookups(object): DOCS: https://spacy.io/api/lookups#from_bytes """ - self._tables = OrderedDict() + self._tables = {} for key, value in srsly.msgpack_loads(bytes_data).items(): self._tables[key] = Table(key) self._tables[key].update(value) @@ -254,12 +251,12 @@ class Table(OrderedDict): DOCS: https://spacy.io/api/lookups#table.to_bytes """ - data = [ - ("name", self.name), - ("dict", dict(self.items())), - ("bloom", self.bloom.to_bytes()), - ] - return srsly.msgpack_dumps(OrderedDict(data)) + data = { + "name": self.name, + "dict": dict(self.items()), + "bloom": self.bloom.to_bytes(), + } + return srsly.msgpack_dumps(data) def from_bytes(self, bytes_data): """Load a table from a bytestring. diff --git a/spacy/matcher/__init__.py b/spacy/matcher/__init__.py index 91874ed43..286844787 100644 --- a/spacy/matcher/__init__.py +++ b/spacy/matcher/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .matcher import Matcher from .phrasematcher import PhraseMatcher from .dependencymatcher import DependencyMatcher diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py index 1b10f0dd5..ce6379c45 100644 --- a/spacy/matcher/_schemas.py +++ b/spacy/matcher/_schemas.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - TOKEN_PATTERN_SCHEMA = { "$schema": "http://json-schema.org/draft-06/schema", diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 56d27024d..46cff0d0c 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from cymem.cymem cimport Pool from preshed.maps cimport PreshMap diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 30ef3dd36..2908ab0c2 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from libcpp.vector cimport vector from libc.stdint cimport int32_t from cymem.cymem cimport Pool diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index 4de5782f9..20f45b9e4 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -1,7 +1,5 @@ # cython: infer_types=True # cython: profile=True -from __future__ import unicode_literals - from libc.stdint cimport uintptr_t from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py index 57e7ef571..8eebf0564 100644 --- a/spacy/ml/__init__.py +++ b/spacy/ml/__init__.py @@ -1,5 +1,2 @@ -# coding: utf8 -from __future__ import unicode_literals - from .tok2vec import Tok2Vec # noqa: F401 from .common import FeedForward, LayerNormalizedMaxout # noqa: F401 diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py index b077a46b7..e7baae380 100644 --- a/spacy/ml/_legacy_tok2vec.py +++ b/spacy/ml/_legacy_tok2vec.py @@ -1,5 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals from thinc.v2v import Model, Maxout from thinc.i2v import HashEmbed, StaticVectors from thinc.t2t import ExtractWindow diff --git a/spacy/ml/_wire.py b/spacy/ml/_wire.py index fa271b37c..2b1144fcb 100644 --- a/spacy/ml/_wire.py +++ b/spacy/ml/_wire.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals from thinc.api import layerize, wrap, noop, chain, concatenate from thinc.v2v import Model diff 
--git a/spacy/ml/common.py b/spacy/ml/common.py index f90b53a15..4ecb00e4e 100644 --- a/spacy/ml/common.py +++ b/spacy/ml/common.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from thinc.api import chain from thinc.v2v import Maxout from thinc.misc import LayerNorm diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py index 8f86475ef..9a0ed6bf5 100644 --- a/spacy/ml/tok2vec.py +++ b/spacy/ml/tok2vec.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued from thinc.api import noop, with_square_sequences from thinc.v2v import Maxout, Model diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index c146094a9..f12691170 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,12 +1,8 @@ # cython: infer_types -# coding: utf8 -from __future__ import unicode_literals - from libc.string cimport memset import srsly from collections import Counter -from .compat import basestring_ from .strings import get_string_id from . import symbols from .attrs cimport POS, IS_SPACE @@ -190,7 +186,7 @@ cdef class Morphology: present. Returns the hash of the new analysis. """ for f in features: - if isinstance(f, basestring_): + if isinstance(f, str): self.strings.add(f) string_features = features features = intify_features(features) diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 3925a6738..e71fb917f 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - IDS = { "": NO_TAG, diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py index de8403152..2f9824eda 100644 --- a/spacy/pipeline/__init__.py +++ b/spacy/pipeline/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer from .pipes import SentenceRecognizer diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 2db312d64..75120dfe6 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,12 +1,8 @@ -# coding: utf8 -from __future__ import unicode_literals - -from collections import defaultdict, OrderedDict +from collections import defaultdict import srsly from ..language import component from ..errors import Errors -from ..compat import basestring_ from ..util import ensure_path, to_disk, from_disk from ..tokens import Span from ..matcher import Matcher, PhraseMatcher @@ -201,7 +197,7 @@ class EntityRuler(object): self._ent_ids[key] = (ent_label, entry["id"]) pattern = entry["pattern"] - if isinstance(pattern, basestring_): + if isinstance(pattern, str): self.phrase_patterns[label].append(self.nlp(pattern)) elif isinstance(pattern, list): self.token_patterns[label].append(pattern) @@ -230,8 +226,8 @@ class EntityRuler(object): RETURNS (str): The ent_label joined with configured `ent_id_sep` """ - if isinstance(ent_id, basestring_): - label = "{}{}{}".format(label, self.ent_id_sep, ent_id) + if isinstance(ent_id, str): + label = f"{label}{self.ent_id_sep}{ent_id}" return label def from_bytes(self, patterns_bytes, **kwargs): @@ -264,15 +260,12 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#to_bytes """ - - serial = OrderedDict( - ( - ("overwrite", self.overwrite), - ("ent_id_sep", self.ent_id_sep), - ("phrase_matcher_attr", self.phrase_matcher_attr), - ("patterns", 
self.patterns), - ) - ) + serial = { + "overwrite": self.overwrite, + "ent_id_sep": self.ent_id_sep, + "phrase_matcher_attr": self.phrase_matcher_attr, + "patterns": self.patterns, + } return srsly.msgpack_dumps(serial) def from_disk(self, path, **kwargs): diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 69e638da2..6e9d4197c 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from ..language import component from ..matcher import Matcher from ..util import filter_spans diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index b61a34c0e..68385c5a9 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural._classes.difference import Siamese, CauchySimilarity diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index adcff9280..10038d410 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -1,5 +1,4 @@ -from __future__ import unicode_literals -from collections import OrderedDict, defaultdict +from collections import defaultdict import numpy cimport numpy as np @@ -13,7 +12,6 @@ from .._ml import Tok2Vec, build_morphologizer_model from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import create_default_optimizer from ..errors import Errors, TempErrors -from ..compat import basestring_ from ..tokens.doc cimport Doc from ..vocab cimport Vocab from ..morphology cimport Morphology @@ -32,7 +30,7 @@ class Morphologizer(Pipe): def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model - self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg = dict(sorted(cfg.items())) self.cfg.setdefault('cnn_maxout_pieces', 2) self._class_map = self.vocab.morphology.create_class_map() diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index b041e2441..ff88340cd 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1,12 +1,8 @@ # cython: infer_types=True # cython: profile=True -# coding: utf8 -from __future__ import unicode_literals - import numpy import srsly import random -from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm @@ -24,7 +20,6 @@ from .functions import merge_subtokens from ..language import Language, component from ..syntax import nonproj from ..gold import Example -from ..compat import basestring_ from ..attrs import POS, ID from ..parts_of_speech import X from ..kb import KnowledgeBase @@ -183,7 +178,7 @@ class Pipe(object): exclude (list): String names of serialization fields to exclude. RETURNS (bytes): The serialized object. 
""" - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) if self.model not in (True, False, None): serialize["model"] = self.model.to_bytes @@ -206,7 +201,7 @@ class Pipe(object): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) if hasattr(self, "vocab"): deserialize["vocab"] = lambda b: self.vocab.from_bytes(b) @@ -217,7 +212,7 @@ class Pipe(object): def to_disk(self, path, exclude=tuple(), **kwargs): """Serialize the pipe to disk.""" - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) if self.model not in (None, True, False): @@ -239,7 +234,7 @@ class Pipe(object): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["model"] = load_model @@ -409,7 +404,7 @@ class Tagger(Pipe): self.vocab = vocab self.model = model self._rehearsal_model = None - self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg = dict(sorted(cfg.items())) self.cfg.setdefault("cnn_maxout_pieces", 2) @property @@ -564,7 +559,7 @@ class Tagger(Pipe): if not any(table in self.vocab.lookups for table in lemma_tables): user_warning(Warnings.W022) orig_tag_map = dict(self.vocab.morphology.tag_map) - new_tag_map = OrderedDict() + new_tag_map = {} for example in get_examples(): for tag in example.token_annotation.tags: if tag in orig_tag_map: @@ -594,7 +589,7 @@ class Tagger(Pipe): return build_tagger_model(n_tags, **cfg) def add_label(self, label, values=None): - if not isinstance(label, basestring_): + if not isinstance(label, str): raise ValueError(Errors.E187) if label in self.labels: return 0 @@ -624,12 +619,12 @@ class Tagger(Pipe): yield def to_bytes(self, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} if self.model not in (None, True, False): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) serialize["tag_map"] = lambda: srsly.msgpack_dumps(tag_map) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) return util.to_bytes(serialize, exclude) @@ -656,24 +651,24 @@ class Tagger(Pipe): lemmatizer=self.vocab.morphology.lemmatizer, exc=self.vocab.morphology.exc) - deserialize = OrderedDict(( - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("tag_map", load_tag_map), - ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), - ("model", lambda b: load_model(b)), - )) + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "tag_map": load_tag_map, + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, exclude=tuple(), **kwargs): - tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items())) - serialize = OrderedDict(( - ("vocab", lambda p: self.vocab.to_disk(p)), - ("tag_map", lambda p: srsly.write_msgpack(p, tag_map)), - ("model", lambda p: p.open("wb").write(self.model.to_bytes())), - ("cfg", lambda p: 
srsly.write_json(p, self.cfg)) - )) + tag_map = dict(sorted(self.vocab.morphology.tag_map.items())) + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "tag_map": lambda p: srsly.write_msgpack(p, tag_map), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg) + } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -697,12 +692,12 @@ class Tagger(Pipe): lemmatizer=self.vocab.morphology.lemmatizer, exc=self.vocab.morphology.exc) - deserialize = OrderedDict(( - ("cfg", lambda p: self.cfg.update(_load_cfg(p))), - ("vocab", lambda p: self.vocab.from_disk(p)), - ("tag_map", load_tag_map), - ("model", load_model), - )) + deserialize = { + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "vocab": lambda p: self.vocab.from_disk(p), + "tag_map": load_tag_map, + "model": load_model, + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -719,7 +714,7 @@ class SentenceRecognizer(Tagger): self.vocab = vocab self.model = model self._rehearsal_model = None - self.cfg = OrderedDict(sorted(cfg.items())) + self.cfg = dict(sorted(cfg.items())) self.cfg.setdefault("cnn_maxout_pieces", 2) self.cfg.setdefault("subword_features", True) self.cfg.setdefault("token_vector_width", 12) @@ -816,7 +811,7 @@ class SentenceRecognizer(Tagger): yield def to_bytes(self, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} if self.model not in (None, True, False): serialize["model"] = self.model.to_bytes serialize["vocab"] = self.vocab.to_bytes @@ -833,21 +828,21 @@ class SentenceRecognizer(Tagger): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict(( - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))), - ("model", lambda b: load_model(b)), - )) + deserialize = { + "vocab": lambda b: self.vocab.from_bytes(b), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: load_model(b), + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_bytes(bytes_data, deserialize, exclude) return self def to_disk(self, path, exclude=tuple(), **kwargs): - serialize = OrderedDict(( - ("vocab", lambda p: self.vocab.to_disk(p)), - ("model", lambda p: p.open("wb").write(self.model.to_bytes())), - ("cfg", lambda p: srsly.write_json(p, self.cfg)) - )) + serialize = { + "vocab": lambda p: self.vocab.to_disk(p), + "model": lambda p: p.open("wb").write(self.model.to_bytes()), + "cfg": lambda p: srsly.write_json(p, self.cfg) + } exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) @@ -861,11 +856,11 @@ class SentenceRecognizer(Tagger): except AttributeError: raise ValueError(Errors.E149) - deserialize = OrderedDict(( - ("cfg", lambda p: self.cfg.update(_load_cfg(p))), - ("vocab", lambda p: self.vocab.from_disk(p)), - ("model", load_model), - )) + deserialize = { + "cfg": lambda p: self.cfg.update(_load_cfg(p)), + "vocab": lambda p: self.vocab.from_disk(p), + "model": load_model, + } exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self @@ -1241,7 +1236,7 @@ class TextCategorizer(Pipe): return float(mean_square_error), d_scores def add_label(self, label): - if not isinstance(label, basestring_): + if not isinstance(label, str): raise ValueError(Errors.E187) if label in 
self.labels: return 0 @@ -1614,7 +1609,7 @@ class EntityLinker(Pipe): token.ent_kb_id_ = kb_id def to_disk(self, path, exclude=tuple(), **kwargs): - serialize = OrderedDict() + serialize = {} serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) @@ -1637,7 +1632,7 @@ class EntityLinker(Pipe): kb.load_bulk(p) self.set_kb(kb) - deserialize = OrderedDict() + deserialize = {} deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) deserialize["vocab"] = lambda p: self.vocab.from_disk(p) deserialize["kb"] = load_kb diff --git a/spacy/scorer.py b/spacy/scorer.py index 6238b6ead..82b10a77d 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import division, print_function, unicode_literals - import numpy as np from .gold import tags_to_entities, GoldParse, DocAnnotation diff --git a/spacy/strings.pyx b/spacy/strings.pyx index f3457e1a5..0605de96c 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,7 +1,4 @@ # cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals, absolute_import - cimport cython from libc.string cimport memcpy from libcpp.set cimport set @@ -9,7 +6,6 @@ from libc.stdint cimport uint32_t from murmurhash.mrmr cimport hash64, hash32 import srsly -from .compat import basestring_ from .symbols import IDS as SYMBOLS_BY_STR from .symbols import NAMES as SYMBOLS_BY_INT from .typedefs cimport hash_t @@ -24,7 +20,7 @@ def get_string_id(key): This function optimises for convenience over performance, so shouldn't be used in tight loops. """ - if not isinstance(key, basestring_): + if not isinstance(key, str): return key elif key in SYMBOLS_BY_STR: return SYMBOLS_BY_STR[key] @@ -150,7 +146,7 @@ cdef class StringStore: return key else: return self[key] - + def add(self, string): """Add a string to the StringStore. 
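Two mechanical patterns dominate the hunks above. First, `basestring_` checks become plain `isinstance(x, str)`: the alias existed because Python 2 had both `str` and `unicode` under a common `basestring` base. Second, `OrderedDict` becomes a plain `dict` for the serializer tables: from Python 3.6, dicts preserve insertion order (an implementation detail there, a language guarantee from 3.7). A hedged sketch of both, assuming the old `spacy.compat` alias worked roughly like this (its exact definition is not shown in this diff):

```python
# Assumed shape of the retired compat alias, for illustration only:
try:                            # Python 2
    basestring_ = basestring    # noqa: F821 -- covered str and unicode
except NameError:               # Python 3
    basestring_ = str           # the alias collapses to the builtin
assert isinstance("subtok", basestring_) == isinstance("subtok", str)

# Why the serializer tables no longer need collections.OrderedDict:
serializers = {}
serializers["vocab"] = lambda: b"vocab-bytes"
serializers["tokenizer"] = lambda: b"tokenizer-bytes"
assert list(serializers) == ["vocab", "tokenizer"]  # insertion order kept

# The same guarantee powers the order-preserving dedup in Lemmatizer:
forms = ["walk", "walked", "walk", "walks"]
assert list(dict.fromkeys(forms)) == ["walk", "walked", "walks"]
```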
diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index b65ae9628..85f23ccbc 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,8 +1,4 @@ -# coding: utf8 -#cython: optimize.unpack_method_calls=False -from __future__ import unicode_literals - - +# cython: optimize.unpack_method_calls=False IDS = { "": NIL, "IS_ALPHA": IS_ALPHA, diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx index 8b6448a46..19d05e77f 100644 --- a/spacy/syntax/_parser_model.pyx +++ b/spacy/syntax/_parser_model.pyx @@ -1,10 +1,6 @@ # cython: infer_types=True # cython: cdivision=True # cython: boundscheck=False -# coding: utf-8 -from __future__ import unicode_literals, print_function - -from collections import OrderedDict import numpy cimport cython.parallel import numpy.random @@ -249,7 +245,7 @@ class ParserModel(Model): def resize_output(self, new_output): if len(self._layers) == 2: - return + return if new_output == self.upper.nO: return smaller = self.upper @@ -485,7 +481,7 @@ cdef class precompute_hiddens: ops = NumpyOps() else: ops = CupyOps() - + if self.activation == "maxout": state_vector, mask = ops.maxout(state_vector) else: diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 45fd1170b..5ec169428 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,12 +1,9 @@ # cython: profile=True # cython: cdivision=True # cython: infer_types=True -# coding: utf-8 -from __future__ import unicode_literals - from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool -from collections import OrderedDict, defaultdict, Counter +from collections import defaultdict, Counter from thinc.extra.search cimport Beam import json @@ -25,7 +22,7 @@ from ..tokens.doc cimport Doc, set_children_from_heads # Calculate cost as gold/not gold. We don't use scalar value anyway. 
cdef int BINARY_COSTS = 1 cdef weight_t MIN_SCORE = -90000 -cdef attr_t SUBTOK_LABEL = hash_string('subtok') +cdef attr_t SUBTOK_LABEL = hash_string(u'subtok') DEF NON_MONOTONIC = True DEF USE_BREAK = True diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 7467aa342..5dfa20b7d 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,9 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam -from collections import OrderedDict, Counter +from collections import Counter from .stateclass cimport StateClass from ._state cimport StateC diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index c98baf6fd..14d9e54d4 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,10 +1,6 @@ # cython: infer_types=True # cython: cdivision=True # cython: boundscheck=False -# coding: utf-8 -from __future__ import unicode_literals, print_function - -from collections import OrderedDict import numpy cimport cython.parallel import numpy.random @@ -692,22 +688,22 @@ cdef class Parser: return self def to_bytes(self, exclude=tuple(), **kwargs): - serializers = OrderedDict(( - ('model', lambda: (self.model.to_bytes() if self.model is not True else True)), - ('vocab', lambda: self.vocab.to_bytes()), - ('moves', lambda: self.moves.to_bytes(exclude=["strings"])), - ('cfg', lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True)) - )) + serializers = { + "model": lambda: (self.model.to_bytes() if self.model is not True else True), + "vocab": lambda: self.vocab.to_bytes(), + "moves": lambda: self.moves.to_bytes(exclude=["strings"]), + "cfg": lambda: srsly.json_dumps(self.cfg, indent=2, sort_keys=True) + } exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): - deserializers = OrderedDict(( - ('vocab', lambda b: self.vocab.from_bytes(b)), - ('moves', lambda b: self.moves.from_bytes(b, exclude=["strings"])), - ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))), - ('model', lambda b: None) - )) + deserializers = { + "vocab": lambda b: self.vocab.from_bytes(b), + "moves": lambda b: self.moves.from_bytes(b, exclude=["strings"]), + "cfg": lambda b: self.cfg.update(srsly.json_loads(b)), + "model": lambda b: None + } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) if 'model' not in exclude: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 2ec6b61ac..0f738f99f 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,12 +1,9 @@ -# coding: utf-8 # cython: profile=True # cython: infer_types=True """Implements the projectivize/deprojectivize mechanism in Nivre & Nilsson 2005 for doing pseudo-projective parsing implementation uses the HEAD decoration scheme. 
""" -from __future__ import unicode_literals - from copy import copy from spacy.gold import Example diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 2a15a2de1..47b37946c 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -1,7 +1,4 @@ -# coding: utf-8 # cython: infer_types=True -from __future__ import unicode_literals - import numpy from ..tokens.doc cimport Doc diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 65097f114..62e369091 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,12 +1,9 @@ # cython: infer_types=True -# coding: utf-8 -from __future__ import unicode_literals - from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t from thinc.extra.search cimport Beam -from collections import OrderedDict, Counter +from collections import Counter import srsly from . cimport _beam_utils diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 816970e61..ba7b67e25 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class diff --git a/spacy/tests/doc/test_add_entities.py b/spacy/tests/doc/test_add_entities.py index 6c69e699a..766dcb739 100644 --- a/spacy/tests/doc/test_add_entities.py +++ b/spacy/tests/doc/test_add_entities.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.pipeline import EntityRecognizer from spacy.tokens import Span import pytest diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index 7b513cfab..6be6e3867 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.attrs import ORTH, SHAPE, POS, DEP diff --git a/spacy/tests/doc/test_creation.py b/spacy/tests/doc/test_creation.py index 120fb6e28..d986d160c 100644 --- a/spacy/tests/doc/test_creation.py +++ b/spacy/tests/doc/test_creation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokens import Doc diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index 86c7fbf72..41a060b7b 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy diff --git a/spacy/tests/doc/test_morphanalysis.py b/spacy/tests/doc/test_morphanalysis.py index 5d570af53..67ebc06d6 100644 --- a/spacy/tests/doc/test_morphanalysis.py +++ b/spacy/tests/doc/test_morphanalysis.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/doc/test_pickle_doc.py b/spacy/tests/doc/test_pickle_doc.py index 2b6970a38..28cb66714 100644 --- a/spacy/tests/doc/test_pickle_doc.py +++ b/spacy/tests/doc/test_pickle_doc.py @@ -1,8 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.language import Language -from spacy.compat import pickle, unicode_ +from spacy.compat import pickle def test_pickle_single_doc(): @@ -16,9 +13,9 @@ def test_pickle_single_doc(): def test_list_of_docs_pickles_efficiently(): nlp = Language() for i in range(10000): - _ = nlp.vocab[unicode_(i)] # noqa: F841 + _ = nlp.vocab[str(i)] # noqa: F841 
one_pickled = pickle.dumps(nlp("0"), -1) - docs = list(nlp.pipe(unicode_(i) for i in range(100))) + docs = list(nlp.pipe(str(i) for i in range(100))) many_pickled = pickle.dumps(docs, -1) assert len(many_pickled) < (len(one_pickled) * 2) many_unpickled = pickle.loads(many_pickled) diff --git a/spacy/tests/doc/test_retokenize_merge.py b/spacy/tests/doc/test_retokenize_merge.py index 5bdf78f39..c82c04eeb 100644 --- a/spacy/tests/doc/test_retokenize_merge.py +++ b/spacy/tests/doc/test_retokenize_merge.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import LEMMA from spacy.vocab import Vocab diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py index d074fddc6..33b6fbe81 100644 --- a/spacy/tests/doc/test_retokenize_split.py +++ b/spacy/tests/doc/test_retokenize_split.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokens import Doc, Token diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py index 01bb93c50..9fb552d44 100644 --- a/spacy/tests/doc/test_span.py +++ b/spacy/tests/doc/test_span.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import ORTH, LENGTH from spacy.tokens import Doc, Span diff --git a/spacy/tests/doc/test_to_json.py b/spacy/tests/doc/test_to_json.py index a063a6569..18243c306 100644 --- a/spacy/tests/doc/test_to_json.py +++ b/spacy/tests/doc/test_to_json.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.cli._schemas import TRAINING_SCHEMA from spacy.util import get_json_validator, validate_json diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index bff2a95c6..cff1d3327 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from spacy.attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP diff --git a/spacy/tests/doc/test_underscore.py b/spacy/tests/doc/test_underscore.py index 2877bfeea..352460581 100644 --- a/spacy/tests/doc/test_underscore.py +++ b/spacy/tests/doc/test_underscore.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from mock import Mock from spacy.tokens import Doc, Span, Token diff --git a/spacy/tests/lang/ar/test_exceptions.py b/spacy/tests/lang/ar/test_exceptions.py index 3cfc380d2..125220caf 100644 --- a/spacy/tests/lang/ar/test_exceptions.py +++ b/spacy/tests/lang/ar/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ar/test_text.py b/spacy/tests/lang/ar/test_text.py index 109c3721a..f4a8cc1e3 100644 --- a/spacy/tests/lang/ar/test_text.py +++ b/spacy/tests/lang/ar/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - def test_ar_tokenizer_handles_long_text(ar_tokenizer): text = """نجيب محفوظ مؤلف و كاتب روائي عربي، يعد من أهم الأدباء العرب خلال القرن العشرين. 
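The pickle test above swaps `unicode_(i)` for the builtin `str(i)`: `unicode_` was a compat alias for the unicode text type (assumed to be `unicode` on Python 2), so on Python 3 the builtin replaces it with no behaviour change. The diff's other string modernisation, `str.format()` to f-strings, only parses on 3.6+, which is exactly the new floor. A short sketch reusing the `label`/`ent_id_sep` names from the EntityRuler hunk earlier in this diff:

```python
# Not from the diff: the two string modernisations side by side.
vocab_keys = [str(i) for i in range(3)]          # was: unicode_(i)
assert all(isinstance(k, str) for k in vocab_keys)

label, ent_id_sep, ent_id = "ORG", "||", "q42"   # names from the EntityRuler hunk
assert f"{label}{ent_id_sep}{ent_id}" == "{}{}{}".format(label, ent_id_sep, ent_id)
```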
diff --git a/spacy/tests/lang/bn/test_tokenizer.py b/spacy/tests/lang/bn/test_tokenizer.py index 62dd52778..5b18c5269 100644 --- a/spacy/tests/lang/bn/test_tokenizer.py +++ b/spacy/tests/lang/bn/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_exception.py b/spacy/tests/lang/ca/test_exception.py index 56156c328..71098f094 100644 --- a/spacy/tests/lang/ca/test_exception.py +++ b/spacy/tests/lang/ca/test_exception.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_prefix_suffix_infix.py b/spacy/tests/lang/ca/test_prefix_suffix_infix.py index 4583a62b9..83a75f056 100644 --- a/spacy/tests/lang/ca/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ca/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 1506016d4..38f5fc708 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -1,10 +1,4 @@ -# coding: utf-8 - """Test that longer and mixed texts are tokenized correctly.""" - - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index a522ab5e8..603378ea7 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_prefix_suffix_infix.py b/spacy/tests/lang/da/test_prefix_suffix_infix.py index 8b43bf360..e36b3cdb9 100644 --- a/spacy/tests/lang/da/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/da/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/da/test_text.py b/spacy/tests/lang/da/test_text.py index 07b134e2d..3c6cca5ac 100644 --- a/spacy/tests/lang/da/test_text.py +++ b/spacy/tests/lang/da/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.da.lex_attrs import like_num diff --git a/spacy/tests/lang/de/test_exceptions.py b/spacy/tests/lang/de/test_exceptions.py index 2e065870e..a4614f6c4 100644 --- a/spacy/tests/lang/de/test_exceptions.py +++ b/spacy/tests/lang/de/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/de/test_parser.py b/spacy/tests/lang/de/test_parser.py index 5c8694da3..c897dcf2f 100644 --- a/spacy/tests/lang/de/test_parser.py +++ b/spacy/tests/lang/de/test_parser.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/de/test_prefix_suffix_infix.py b/spacy/tests/lang/de/test_prefix_suffix_infix.py index 13e109395..82bd8ed69 100644 --- a/spacy/tests/lang/de/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/de/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/de/test_text.py b/spacy/tests/lang/de/test_text.py index b3fb1eaa5..22711763e 100644 --- a/spacy/tests/lang/de/test_text.py +++ b/spacy/tests/lang/de/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git 
a/spacy/tests/lang/el/test_exception.py b/spacy/tests/lang/el/test_exception.py index b8d10fb69..a4656ea98 100644 --- a/spacy/tests/lang/el/test_exception.py +++ b/spacy/tests/lang/el/test_exception.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/el/test_text.py b/spacy/tests/lang/el/test_text.py index a6395ab4a..1b3ef6182 100644 --- a/spacy/tests/lang/el/test_text.py +++ b/spacy/tests/lang/el/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_customized_tokenizer.py b/spacy/tests/lang/en/test_customized_tokenizer.py index 7f939011f..f5302cb31 100644 --- a/spacy/tests/lang/en/test_customized_tokenizer.py +++ b/spacy/tests/lang/en/test_customized_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.lang.en import English diff --git a/spacy/tests/lang/en/test_exceptions.py b/spacy/tests/lang/en/test_exceptions.py index 6285a9408..b2e941dab 100644 --- a/spacy/tests/lang/en/test_exceptions.py +++ b/spacy/tests/lang/en/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_indices.py b/spacy/tests/lang/en/test_indices.py index 8a7bc0323..d50c75fc5 100644 --- a/spacy/tests/lang/en/test_indices.py +++ b/spacy/tests/lang/en/test_indices.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - def test_en_simple_punct(en_tokenizer): text = "to walk, do foo" diff --git a/spacy/tests/lang/en/test_noun_chunks.py b/spacy/tests/lang/en/test_noun_chunks.py index 7dc47f9cc..6739b5137 100644 --- a/spacy/tests/lang/en/test_noun_chunks.py +++ b/spacy/tests/lang/en/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import numpy from spacy.attrs import HEAD, DEP from spacy.symbols import nsubj, dobj, amod, nmod, conj, cc, root diff --git a/spacy/tests/lang/en/test_parser.py b/spacy/tests/lang/en/test_parser.py index ce696bc25..057143696 100644 --- a/spacy/tests/lang/en/test_parser.py +++ b/spacy/tests/lang/en/test_parser.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/en/test_prefix_suffix_infix.py b/spacy/tests/lang/en/test_prefix_suffix_infix.py index 3dccd6bcf..8c9c58fea 100644 --- a/spacy/tests/lang/en/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/en/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/en/test_punct.py b/spacy/tests/lang/en/test_punct.py index 61274cf14..4dc6ddfe4 100644 --- a/spacy/tests/lang/en/test_punct.py +++ b/spacy/tests/lang/en/test_punct.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import compile_prefix_regex from spacy.lang.punctuation import TOKENIZER_PREFIXES diff --git a/spacy/tests/lang/en/test_sbd.py b/spacy/tests/lang/en/test_sbd.py index 40bd110e8..ba7b2f2cf 100644 --- a/spacy/tests/lang/en/test_sbd.py +++ b/spacy/tests/lang/en/test_sbd.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc, apply_transition_sequence diff --git a/spacy/tests/lang/en/test_tagger.py b/spacy/tests/lang/en/test_tagger.py index 567fd5a44..d9eced2ff 100644 --- a/spacy/tests/lang/en/test_tagger.py 
+++ b/spacy/tests/lang/en/test_tagger.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ...util import get_doc diff --git a/spacy/tests/lang/en/test_text.py b/spacy/tests/lang/en/test_text.py index a7ebde989..c5d56d885 100644 --- a/spacy/tests/lang/en/test_text.py +++ b/spacy/tests/lang/en/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en.lex_attrs import like_num diff --git a/spacy/tests/lang/es/test_exception.py b/spacy/tests/lang/es/test_exception.py index 8d6164058..90d897a4c 100644 --- a/spacy/tests/lang/es/test_exception.py +++ b/spacy/tests/lang/es/test_exception.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/es/test_text.py b/spacy/tests/lang/es/test_text.py index acd572b48..af7b0212d 100644 --- a/spacy/tests/lang/es/test_text.py +++ b/spacy/tests/lang/es/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fi/test_text.py b/spacy/tests/lang/fi/test_text.py index 2dd92597e..dbb67ad7a 100644 --- a/spacy/tests/lang/fi/test_text.py +++ b/spacy/tests/lang/fi/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fi/test_tokenizer.py b/spacy/tests/lang/fi/test_tokenizer.py index 17f6f0ccc..6d5a14e6e 100644 --- a/spacy/tests/lang/fi/test_tokenizer.py +++ b/spacy/tests/lang/fi/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fr/test_exceptions.py b/spacy/tests/lang/fr/test_exceptions.py index 93dbf0993..98d318f6e 100644 --- a/spacy/tests/lang/fr/test_exceptions.py +++ b/spacy/tests/lang/fr/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/fr/test_prefix_suffix_infix.py b/spacy/tests/lang/fr/test_prefix_suffix_infix.py index ca6bdbd87..01d50b0a6 100644 --- a/spacy/tests/lang/fr/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/fr/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.lang.punctuation import TOKENIZER_INFIXES diff --git a/spacy/tests/lang/fr/test_text.py b/spacy/tests/lang/fr/test_text.py index 24b4c4532..01231f593 100644 --- a/spacy/tests/lang/fr/test_text.py +++ b/spacy/tests/lang/fr/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.fr.lex_attrs import like_num diff --git a/spacy/tests/lang/ga/test_tokenizer.py b/spacy/tests/lang/ga/test_tokenizer.py index 29bc1c759..78127ef7c 100644 --- a/spacy/tests/lang/ga/test_tokenizer.py +++ b/spacy/tests/lang/ga/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/he/test_tokenizer.py b/spacy/tests/lang/he/test_tokenizer.py index f138ec6e7..3131014a3 100644 --- a/spacy/tests/lang/he/test_tokenizer.py +++ b/spacy/tests/lang/he/test_tokenizer.py @@ -1,6 +1,3 @@ -# encoding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/hu/test_tokenizer.py b/spacy/tests/lang/hu/test_tokenizer.py index fa8e132c0..4ec720c60 100644 --- a/spacy/tests/lang/hu/test_tokenizer.py +++ b/spacy/tests/lang/hu/test_tokenizer.py @@ -1,6 +1,3 @@ -# 
coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/id/test_prefix_suffix_infix.py b/spacy/tests/lang/id/test_prefix_suffix_infix.py index e86a98ee3..2a81dab01 100644 --- a/spacy/tests/lang/id/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/id/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/id/test_text.py b/spacy/tests/lang/id/test_text.py index 915d268ae..ed6487b68 100644 --- a/spacy/tests/lang/id/test_text.py +++ b/spacy/tests/lang/id/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.id.lex_attrs import like_num diff --git a/spacy/tests/lang/it/test_prefix_suffix_infix.py b/spacy/tests/lang/it/test_prefix_suffix_infix.py index f84351fd7..46f66b5e6 100644 --- a/spacy/tests/lang/it/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/it/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_lemmatization.py b/spacy/tests/lang/ja/test_lemmatization.py index cfff0fcfe..4cb3110b3 100644 --- a/spacy/tests/lang/ja/test_lemmatization.py +++ b/spacy/tests/lang/ja/test_lemmatization.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ja/test_tokenizer.py b/spacy/tests/lang/ja/test_tokenizer.py index ad8bfaa00..481f346bb 100644 --- a/spacy/tests/lang/ja/test_tokenizer.py +++ b/spacy/tests/lang/ja/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py index 42c306c11..7782ca4bc 100644 --- a/spacy/tests/lang/ko/test_lemmatization.py +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index b8fe7959c..eac309857 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest # fmt: off diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py index 7ca2394b7..5b5005ae7 100644 --- a/spacy/tests/lang/lb/test_exceptions.py +++ b/spacy/tests/lang/lb/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lb/test_prefix_suffix_infix.py b/spacy/tests/lang/lb/test_prefix_suffix_infix.py index d85f932be..3958d1543 100644 --- a/spacy/tests/lang/lb/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/lb/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py index 36464b379..b0ba76b6b 100644 --- a/spacy/tests/lang/lb/test_text.py +++ b/spacy/tests/lang/lb/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index cac32aa4d..8d9201cd9 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest 
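With the compat headers gone from every test module, non-ASCII fixture text can appear directly at module top level. A minimal hypothetical test in the post-migration style (not a test from this diff):

```python
# Hypothetical sketch of the post-migration test style: no encoding header,
# no __future__ import, non-ASCII parametrized data used directly.
import pytest

@pytest.mark.parametrize("text,length", [("天气很好", 4), ("héllo", 5)])
def test_unicode_literal_length(text, length):
    assert len(text) == length  # len() counts code points on Python 3
```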
diff --git a/spacy/tests/lang/nb/test_tokenizer.py b/spacy/tests/lang/nb/test_tokenizer.py index f72d310e8..2da6e8d40 100644 --- a/spacy/tests/lang/nb/test_tokenizer.py +++ b/spacy/tests/lang/nb/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/nl/test_text.py b/spacy/tests/lang/nl/test_text.py index 4045b1c39..8bc72cc6d 100644 --- a/spacy/tests/lang/nl/test_text.py +++ b/spacy/tests/lang/nl/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.nl.lex_attrs import like_num diff --git a/spacy/tests/lang/pl/test_text.py b/spacy/tests/lang/pl/test_text.py index ec9b18084..e8654a498 100644 --- a/spacy/tests/lang/pl/test_text.py +++ b/spacy/tests/lang/pl/test_text.py @@ -1,9 +1,4 @@ -# coding: utf-8 """Words like numbers are recognized correctly.""" - - -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/pl/test_tokenizer.py b/spacy/tests/lang/pl/test_tokenizer.py index 9d0034589..a04b4fdcb 100644 --- a/spacy/tests/lang/pl/test_tokenizer.py +++ b/spacy/tests/lang/pl/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest DOT_TESTS = [ diff --git a/spacy/tests/lang/pt/test_text.py b/spacy/tests/lang/pt/test_text.py index 39dfff2c1..3a9162b80 100644 --- a/spacy/tests/lang/pt/test_text.py +++ b/spacy/tests/lang/pt/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.pt.lex_attrs import like_num diff --git a/spacy/tests/lang/ro/test_tokenizer.py b/spacy/tests/lang/ro/test_tokenizer.py index a327174e5..64c072470 100644 --- a/spacy/tests/lang/ro/test_tokenizer.py +++ b/spacy/tests/lang/ro/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ru/test_exceptions.py b/spacy/tests/lang/ru/test_exceptions.py index a8f0c3429..4fb417df8 100644 --- a/spacy/tests/lang/ru/test_exceptions.py +++ b/spacy/tests/lang/ru/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ru/test_lemmatizer.py b/spacy/tests/lang/ru/test_lemmatizer.py index b228fded8..40dcf4cf8 100644 --- a/spacy/tests/lang/ru/test_lemmatizer.py +++ b/spacy/tests/lang/ru/test_lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc diff --git a/spacy/tests/lang/ru/test_text.py b/spacy/tests/lang/ru/test_text.py index c5bff6973..b0eaf66bb 100644 --- a/spacy/tests/lang/ru/test_text.py +++ b/spacy/tests/lang/ru/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.ru.lex_attrs import like_num diff --git a/spacy/tests/lang/ru/test_tokenizer.py b/spacy/tests/lang/ru/test_tokenizer.py index 5507f9f09..e05a479aa 100644 --- a/spacy/tests/lang/ru/test_tokenizer.py +++ b/spacy/tests/lang/ru/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sr/test_exceptions.py b/spacy/tests/lang/sr/test_exceptions.py index 285e99996..fa92e5e2d 100644 --- a/spacy/tests/lang/sr/test_exceptions.py +++ b/spacy/tests/lang/sr/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sr/test_tokenizer.py 
b/spacy/tests/lang/sr/test_tokenizer.py index c4672b3ef..03a0470bd 100644 --- a/spacy/tests/lang/sr/test_tokenizer.py +++ b/spacy/tests/lang/sr/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_exceptions.py b/spacy/tests/lang/sv/test_exceptions.py index c977a4183..5d3acf3d5 100644 --- a/spacy/tests/lang/sv/test_exceptions.py +++ b/spacy/tests/lang/sv/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py index ac7c066ba..ad335c317 100644 --- a/spacy/tests/lang/sv/test_noun_chunks.py +++ b/spacy/tests/lang/sv/test_noun_chunks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ...util import get_doc diff --git a/spacy/tests/lang/sv/test_prefix_suffix_infix.py b/spacy/tests/lang/sv/test_prefix_suffix_infix.py index f3fdd9a9e..bbb0ff415 100644 --- a/spacy/tests/lang/sv/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/sv/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/sv/test_text.py b/spacy/tests/lang/sv/test_text.py index 9ea1851ae..dc4911ab6 100644 --- a/spacy/tests/lang/sv/test_text.py +++ b/spacy/tests/lang/sv/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - def test_sv_tokenizer_handles_long_text(sv_tokenizer): text = """Det var så härligt ute på landet. Det var sommar, majsen var gul, havren grön, diff --git a/spacy/tests/lang/sv/test_tokenizer.py b/spacy/tests/lang/sv/test_tokenizer.py index 894b5aa6a..8871f4414 100644 --- a/spacy/tests/lang/sv/test_tokenizer.py +++ b/spacy/tests/lang/sv/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/test_attrs.py b/spacy/tests/lang/test_attrs.py index 4bb5aac70..b39109455 100644 --- a/spacy/tests/lang/test_attrs.py +++ b/spacy/tests/lang/test_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import intify_attrs, ORTH, NORM, LEMMA, IS_ALPHA from spacy.lang.lex_attrs import is_punct, is_ascii, is_currency, like_url, word_shape diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py index 5c701fc22..de1871e64 100644 --- a/spacy/tests/lang/test_initialize.py +++ b/spacy/tests/lang/test_initialize.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class diff --git a/spacy/tests/lang/th/test_tokenizer.py b/spacy/tests/lang/th/test_tokenizer.py index 265c7753d..1e1ba52dc 100644 --- a/spacy/tests/lang/th/test_tokenizer.py +++ b/spacy/tests/lang/th/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/tt/test_tokenizer.py b/spacy/tests/lang/tt/test_tokenizer.py index 66ef9c181..7e0748931 100644 --- a/spacy/tests/lang/tt/test_tokenizer.py +++ b/spacy/tests/lang/tt/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/uk/test_tokenizer.py b/spacy/tests/lang/uk/test_tokenizer.py index f744b32b0..eb647a041 100644 --- a/spacy/tests/lang/uk/test_tokenizer.py +++ b/spacy/tests/lang/uk/test_tokenizer.py @@ -1,6 +1,3 @@ -# 
coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/uk/test_tokenizer_exc.py b/spacy/tests/lang/uk/test_tokenizer_exc.py index 328e1d287..4fb4a6b31 100644 --- a/spacy/tests/lang/uk/test_tokenizer_exc.py +++ b/spacy/tests/lang/uk/test_tokenizer_exc.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ur/test_prefix_suffix_infix.py b/spacy/tests/lang/ur/test_prefix_suffix_infix.py index de11c9b34..e9f3272f4 100644 --- a/spacy/tests/lang/ur/test_prefix_suffix_infix.py +++ b/spacy/tests/lang/ur/test_prefix_suffix_infix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/ur/test_text.py b/spacy/tests/lang/ur/test_text.py index 546e79182..5da831cf8 100644 --- a/spacy/tests/lang/ur/test_text.py +++ b/spacy/tests/lang/ur/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/yo/test_text.py b/spacy/tests/lang/yo/test_text.py index ce6408b67..48b689f3d 100644 --- a/spacy/tests/lang/yo/test_text.py +++ b/spacy/tests/lang/yo/test_text.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.yo.lex_attrs import like_num diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py index 235f597a5..d48feaee5 100644 --- a/spacy/tests/lang/zh/test_text.py +++ b/spacy/tests/lang/zh/test_text.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py index 36d94beb5..f71785337 100644 --- a/spacy/tests/lang/zh/test_tokenizer.py +++ b/spacy/tests/lang/zh/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index e4584d03a..adeef834d 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from mock import Mock diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py index 240ace537..a6a82f2e2 100644 --- a/spacy/tests/matcher/test_matcher_logic.py +++ b/spacy/tests/matcher/test_matcher_logic.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py index 2db2f9eb3..c879cc0fe 100644 --- a/spacy/tests/matcher/test_pattern_validation.py +++ b/spacy/tests/matcher/test_pattern_validation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.matcher import Matcher from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py index 7a6585e06..23cd80d1d 100644 --- a/spacy/tests/matcher/test_phrase_matcher.py +++ b/spacy/tests/matcher/test_phrase_matcher.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from mock import Mock from spacy.matcher import PhraseMatcher diff --git a/spacy/tests/morphology/test_morph_features.py b/spacy/tests/morphology/test_morph_features.py index 41f807143..4cf6b1206 
100644 --- a/spacy/tests/morphology/test_morph_features.py +++ b/spacy/tests/morphology/test_morph_features.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.morphology import Morphology from spacy.strings import StringStore, get_string_id diff --git a/spacy/tests/parser/test_add_label.py b/spacy/tests/parser/test_add_label.py index bee9db82e..a24fd143d 100644 --- a/spacy/tests/parser/test_add_label.py +++ b/spacy/tests/parser/test_add_label.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from thinc.neural.optimizers import Adam from thinc.neural.ops import NumpyOps diff --git a/spacy/tests/parser/test_arc_eager_oracle.py b/spacy/tests/parser/test_arc_eager_oracle.py index 0d9bd1ad0..dd593f7d3 100644 --- a/spacy/tests/parser/test_arc_eager_oracle.py +++ b/spacy/tests/parser/test_arc_eager_oracle.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.pipeline import DependencyParser diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 8329391ca..8d5043487 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English diff --git a/spacy/tests/parser/test_neural_parser.py b/spacy/tests/parser/test_neural_parser.py index 468b3ff40..0906fbb94 100644 --- a/spacy/tests/parser/test_neural_parser.py +++ b/spacy/tests/parser/test_neural_parser.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy._ml import Tok2Vec from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_nn_beam.py b/spacy/tests/parser/test_nn_beam.py index 9dca99255..24997e47c 100644 --- a/spacy/tests/parser/test_nn_beam.py +++ b/spacy/tests/parser/test_nn_beam.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from spacy.vocab import Vocab diff --git a/spacy/tests/parser/test_nonproj.py b/spacy/tests/parser/test_nonproj.py index 8bf8111c1..86d9a0180 100644 --- a/spacy/tests/parser/test_nonproj.py +++ b/spacy/tests/parser/test_nonproj.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.syntax.nonproj import ancestors, contains_cycle, is_nonproj_arc from spacy.syntax.nonproj import is_nonproj_tree diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index fb5301718..75091ec07 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ..util import get_doc, apply_transition_sequence diff --git a/spacy/tests/parser/test_parse_navigate.py b/spacy/tests/parser/test_parse_navigate.py index eb206458e..ed95718f1 100644 --- a/spacy/tests/parser/test_parse_navigate.py +++ b/spacy/tests/parser/test_parse_navigate.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from ..util import get_doc diff --git a/spacy/tests/parser/test_preset_sbd.py b/spacy/tests/parser/test_preset_sbd.py index d935494d6..ed6aef096 100644 --- a/spacy/tests/parser/test_preset_sbd.py +++ b/spacy/tests/parser/test_preset_sbd.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from thinc.neural.optimizers import Adam from thinc.neural.ops 
import NumpyOps diff --git a/spacy/tests/parser/test_space_attachment.py b/spacy/tests/parser/test_space_attachment.py index 945173faf..59ae4e629 100644 --- a/spacy/tests/parser/test_space_attachment.py +++ b/spacy/tests/parser/test_space_attachment.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens.doc import Doc diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py index 198f11bcd..5c246538c 100644 --- a/spacy/tests/pipeline/test_analysis.py +++ b/spacy/tests/pipeline/test_analysis.py @@ -1,11 +1,7 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy.language from spacy.language import Language, component from spacy.analysis import print_summary, validate_attrs from spacy.analysis import get_assigns_for_attr, get_requires_for_attr -from spacy.compat import is_python2 from mock import Mock, ANY import pytest @@ -17,8 +13,7 @@ def test_component_decorator_function(): return doc assert test_component.name == "test" - if not is_python2: - assert test_component.__doc__ == "docstring" + assert test_component.__doc__ == "docstring" assert test_component("foo") == "foo" @@ -45,13 +40,12 @@ def test_component_decorator_class(): assert test_component("foo") == "foo" assert hasattr(test_component, "custom") assert test_component.custom("bar") == "bar" - if not is_python2: - assert TestComponent.__doc__ == "docstring1" - assert TestComponent.__call__.__doc__ == "docstring2" - assert TestComponent.custom.__doc__ == "docstring3" - assert test_component.__doc__ == "docstring1" - assert test_component.__call__.__doc__ == "docstring2" - assert test_component.custom.__doc__ == "docstring3" + assert TestComponent.__doc__ == "docstring1" + assert TestComponent.__call__.__doc__ == "docstring2" + assert TestComponent.custom.__doc__ == "docstring3" + assert test_component.__doc__ == "docstring1" + assert test_component.__call__.__doc__ == "docstring2" + assert test_component.custom.__doc__ == "docstring3" def test_component_decorator_assigns(): diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 8023f72a6..9ff5f8194 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.kb import KnowledgeBase diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 660ad3b28..210a56cea 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Span from spacy.language import Language diff --git a/spacy/tests/pipeline/test_factories.py b/spacy/tests/pipeline/test_factories.py index 5efcc319a..0a9a4d3c9 100644 --- a/spacy/tests/pipeline/test_factories.py +++ b/spacy/tests/pipeline/test_factories.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.tokens import Span diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py index 5b5fcd2fd..ca983267f 100644 --- a/spacy/tests/pipeline/test_functions.py +++ b/spacy/tests/pipeline/test_functions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.pipeline.functions import merge_subtokens 
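The `test_analysis.py` hunk above drops the `is_python2` guards around the docstring assertions: they existed because Python 2 could not always carry docstrings through wrapping (a class's `__doc__` is read-only there), while on Python 3 the assertions hold unconditionally. A hedged stand-in for the pattern (spaCy's real `component` decorator lives in `spacy.language`; this sketch only shows the metadata handling):

```python
import functools

def component(name):
    """Hypothetical stand-in decorator that, like spaCy's, preserves
    the wrapped function's metadata via functools.wraps."""
    def decorator(func):
        @functools.wraps(func)  # copies __doc__, __name__, etc.
        def wrapper(*args, **kwargs):
            return func(*args, **kwargs)
        wrapper.name = name
        return wrapper
    return decorator

@component(name="test")
def test_component(doc):
    """docstring"""
    return doc

# On Python 3 these need no version guard:
assert test_component.name == "test"
assert test_component.__doc__ == "docstring"
assert test_component("foo") == "foo"
```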
from ..util import get_doc diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 27fb57b18..3ec8b508d 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py index 359552c5b..78ab6d2d1 100644 --- a/spacy/tests/pipeline/test_sentencizer.py +++ b/spacy/tests/pipeline/test_sentencizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import spacy from spacy.pipeline import Sentencizer diff --git a/spacy/tests/pipeline/test_tagger.py b/spacy/tests/pipeline/test_tagger.py index a5bda9090..ca9dab009 100644 --- a/spacy/tests/pipeline/test_tagger.py +++ b/spacy/tests/pipeline/test_tagger.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 44834c2a8..9e37e92e1 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import random import numpy.random diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py index 61d2c9cd2..a3148aa90 100644 --- a/spacy/tests/regression/test_issue1-1000.py +++ b/spacy/tests/regression/test_issue1-1000.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import random from spacy.matcher import Matcher diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py index 924c5aa3e..7d81c3148 100644 --- a/spacy/tests/regression/test_issue1001-1500.py +++ b/spacy/tests/regression/test_issue1001-1500.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py index 4b27901ad..d9e1d663a 100644 --- a/spacy/tests/regression/test_issue1501-2000.py +++ b/spacy/tests/regression/test_issue1501-2000.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import gc import numpy diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index e95c1a9b9..2c25b6d73 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py index 0acb25e90..49e7de179 100644 --- a/spacy/tests/regression/test_issue2501-3000.py +++ b/spacy/tests/regression/test_issue2501-3000.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy import displacy from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py index d05759c31..cc893e472 100644 --- a/spacy/tests/regression/test_issue3001-3500.py +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -1,6 +1,3 @@ -# coding: 
utf8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English from spacy.lang.de import German @@ -9,11 +6,10 @@ from spacy.matcher import Matcher, PhraseMatcher from spacy.tokens import Doc from spacy.vocab import Vocab from spacy.attrs import ENT_IOB, ENT_TYPE -from spacy.compat import pickle, is_python2, unescape_unicode +from spacy.compat import pickle from spacy import displacy from spacy.util import decaying import numpy -import re from spacy.vectors import Vectors from ..util import get_doc @@ -211,73 +207,6 @@ def test_issue3345(): assert ner.moves.is_valid(state, "B-GPE") -if is_python2: - # If we have this test in Python 3, pytest chokes, as it can't print the - # string above in the xpass message. - prefix_search = ( - b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" - b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" - b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" - b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" - b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" - b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" - b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" - b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" - b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" - b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" - b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" - b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" - b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" - b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" - b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" - b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" - b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" - b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" - b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" - b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" - b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" - b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" - b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" - b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" - b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" - b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" - b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" - b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" - b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" - b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" - b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" - b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" - b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" - b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" - b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" - b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" - b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" - b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" - b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" - b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" - 
b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" - b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" - b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" - b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" - b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" - b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" - b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" - b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" - b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" - b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" - b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" - b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" - b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" - b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" - b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" - b"\\U0001FA60-\\U0001FA6D]" - ) - - def test_issue3356(): - pattern = re.compile(unescape_unicode(prefix_search.decode("utf8"))) - assert not pattern.search("hello") - - def test_issue3410(): texts = ["Hello world", "This is a test"] nlp = English() diff --git a/spacy/tests/regression/test_issue3521.py b/spacy/tests/regression/test_issue3521.py index 35731ac12..3d8ee9922 100644 --- a/spacy/tests/regression/test_issue3521.py +++ b/spacy/tests/regression/test_issue3521.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py index c6f513730..aa77028fb 100644 --- a/spacy/tests/regression/test_issue3526.py +++ b/spacy/tests/regression/test_issue3526.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Span from spacy.language import Language diff --git a/spacy/tests/regression/test_issue3531.py b/spacy/tests/regression/test_issue3531.py index 7b9d0bd2a..4c65a5bfe 100644 --- a/spacy/tests/regression/test_issue3531.py +++ b/spacy/tests/regression/test_issue3531.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy import displacy diff --git a/spacy/tests/regression/test_issue3540.py b/spacy/tests/regression/test_issue3540.py index 19d89c797..be9e04b0b 100644 --- a/spacy/tests/regression/test_issue3540.py +++ b/spacy/tests/regression/test_issue3540.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import Doc import numpy as np diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py index 587b3a857..b3af59c2e 100644 --- a/spacy/tests/regression/test_issue3549.py +++ b/spacy/tests/regression/test_issue3549.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.matcher import Matcher from spacy.errors import MatchPatternError diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py index 8444f11f2..de047bcbc 100644 --- a/spacy/tests/regression/test_issue3555.py +++ b/spacy/tests/regression/test_issue3555.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc, Token from spacy.matcher import Matcher diff --git 
a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py index bc8603888..367961ab1 100644 --- a/spacy/tests/regression/test_issue3611.py +++ b/spacy/tests/regression/test_issue3611.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.util import minibatch, compounding diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py index d935db17f..51561b3ac 100644 --- a/spacy/tests/regression/test_issue3625.py +++ b/spacy/tests/regression/test_issue3625.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.hi import Hindi diff --git a/spacy/tests/regression/test_issue3803.py b/spacy/tests/regression/test_issue3803.py index 37d15a5cf..ab5250edf 100644 --- a/spacy/tests/regression/test_issue3803.py +++ b/spacy/tests/regression/test_issue3803.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.es import Spanish diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py index fe722a681..27b1f5f29 100644 --- a/spacy/tests/regression/test_issue3839.py +++ b/spacy/tests/regression/test_issue3839.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index 62e8eabd6..0a851e869 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.attrs import IS_ALPHA from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py index 5cd245231..8500c09aa 100644 --- a/spacy/tests/regression/test_issue3879.py +++ b/spacy/tests/regression/test_issue3879.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index c060473f5..6e8ab6f43 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English import pytest diff --git a/spacy/tests/regression/test_issue3882.py b/spacy/tests/regression/test_issue3882.py index 1b2dcea25..fa616db1d 100644 --- a/spacy/tests/regression/test_issue3882.py +++ b/spacy/tests/regression/test_issue3882.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.displacy import parse_deps from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py index 33230112f..6e4c9eeaa 100644 --- a/spacy/tests/regression/test_issue3951.py +++ b/spacy/tests/regression/test_issue3951.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue3959.py b/spacy/tests/regression/test_issue3959.py index c1f7fe100..7db28a31f 100644 --- a/spacy/tests/regression/test_issue3959.py +++ b/spacy/tests/regression/test_issue3959.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from ..util import 
make_tempdir diff --git a/spacy/tests/regression/test_issue3962.py b/spacy/tests/regression/test_issue3962.py index ae60fa0fa..971c9b08e 100644 --- a/spacy/tests/regression/test_issue3962.py +++ b/spacy/tests/regression/test_issue3962.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from ..util import get_doc diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py index 22b8d486e..fe5388950 100644 --- a/spacy/tests/regression/test_issue3972.py +++ b/spacy/tests/regression/test_issue3972.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import PhraseMatcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py index d075128aa..3ac26d3ab 100644 --- a/spacy/tests/regression/test_issue4002.py +++ b/spacy/tests/regression/test_issue4002.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import PhraseMatcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py index e774feb2d..7153594db 100644 --- a/spacy/tests/regression/test_issue4030.py +++ b/spacy/tests/regression/test_issue4030.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.util import minibatch, compounding diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py index 00a8882d3..6644a8eda 100644 --- a/spacy/tests/regression/test_issue4042.py +++ b/spacy/tests/regression/test_issue4042.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import spacy from spacy.pipeline import EntityRecognizer, EntityRuler from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4054.py b/spacy/tests/regression/test_issue4054.py index cc84cebf8..c52ded395 100644 --- a/spacy/tests/regression/test_issue4054.py +++ b/spacy/tests/regression/test_issue4054.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.vocab import Vocab import spacy from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py index d288f46c4..4849aa238 100644 --- a/spacy/tests/regression/test_issue4120.py +++ b/spacy/tests/regression/test_issue4120.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher from spacy.tokens import Doc diff --git a/spacy/tests/regression/test_issue4133.py b/spacy/tests/regression/test_issue4133.py index 93262f8cf..a726806d7 100644 --- a/spacy/tests/regression/test_issue4133.py +++ b/spacy/tests/regression/test_issue4133.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.tokens import Doc from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue4190.py b/spacy/tests/regression/test_issue4190.py index eb4eb8648..97d532d2a 100644 --- a/spacy/tests/regression/test_issue4190.py +++ b/spacy/tests/regression/test_issue4190.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.tokenizer import Tokenizer from spacy import util diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py index ef871bf9f..891f03b30 100644 --- 
a/spacy/tests/regression/test_issue4267.py +++ b/spacy/tests/regression/test_issue4267.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler diff --git a/spacy/tests/regression/test_issue4272.py b/spacy/tests/regression/test_issue4272.py index c57704d71..4bac97a44 100644 --- a/spacy/tests/regression/test_issue4272.py +++ b/spacy/tests/regression/test_issue4272.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.el import Greek diff --git a/spacy/tests/regression/test_issue4278.py b/spacy/tests/regression/test_issue4278.py index cb09340ff..ffbc41226 100644 --- a/spacy/tests/regression/test_issue4278.py +++ b/spacy/tests/regression/test_issue4278.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.language import Language from spacy.pipeline import Pipe diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py index c68f745a7..a3f6f69df 100644 --- a/spacy/tests/regression/test_issue4313.py +++ b/spacy/tests/regression/test_issue4313.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from collections import defaultdict from spacy.pipeline import EntityRecognizer diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py index 484d5d280..4978e0c8e 100644 --- a/spacy/tests/regression/test_issue4348.py +++ b/spacy/tests/regression/test_issue4348.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.util import minibatch, compounding import pytest diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py index ab6192744..917847a05 100644 --- a/spacy/tests/regression/test_issue4367.py +++ b/spacy/tests/regression/test_issue4367.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import DocBin diff --git a/spacy/tests/regression/test_issue4373.py b/spacy/tests/regression/test_issue4373.py index 57d7547da..dbde1624e 100644 --- a/spacy/tests/regression/test_issue4373.py +++ b/spacy/tests/regression/test_issue4373.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.matcher import Matcher, PhraseMatcher from spacy.vocab import Vocab diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py index 89332ca2f..80d37b1e6 100644 --- a/spacy/tests/regression/test_issue4402.py +++ b/spacy/tests/regression/test_issue4402.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import srsly from spacy.gold import GoldCorpus from spacy.lang.en import English diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py index 460449003..6f96c9f2d 100644 --- a/spacy/tests/regression/test_issue4528.py +++ b/spacy/tests/regression/test_issue4528.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.tokens import Doc, DocBin diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py index 381957be6..fa962c053 100644 --- a/spacy/tests/regression/test_issue4529.py +++ b/spacy/tests/regression/test_issue4529.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.gold import GoldParse diff --git 
a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py index 8ec9a0bd1..74bb5de10 100644 --- a/spacy/tests/regression/test_issue4590.py +++ b/spacy/tests/regression/test_issue4590.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from mock import Mock from spacy.matcher import DependencyMatcher from ..util import get_doc diff --git a/spacy/tests/regression/test_issue4651.py b/spacy/tests/regression/test_issue4651.py index eb49f4a38..3f6c1a57c 100644 --- a/spacy/tests/regression/test_issue4651.py +++ b/spacy/tests/regression/test_issue4651.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.lang.en import English from spacy.pipeline import EntityRuler diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py index 8fa4f9259..149e1431b 100644 --- a/spacy/tests/regression/test_issue4674.py +++ b/spacy/tests/regression/test_issue4674.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.kb import KnowledgeBase from spacy.util import ensure_path diff --git a/spacy/tests/regression/test_issue4707.py b/spacy/tests/regression/test_issue4707.py index e710881d7..d9798ef84 100644 --- a/spacy/tests/regression/test_issue4707.py +++ b/spacy/tests/regression/test_issue4707.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from spacy.util import load_model_from_path from spacy.lang.en import English diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index ef2b1ee89..615bb1cd9 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -1,13 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import spacy - import pytest - from spacy.lang.en import English from spacy.tokens import Doc, DocBin -from spacy.compat import path2str from ..util import make_tempdir @@ -43,7 +37,7 @@ def test_serialize_doc_roundtrip_disk_str_path(en_vocab): doc = Doc(en_vocab, words=["hello", "world"]) with make_tempdir() as d: file_path = d / "doc" - file_path = path2str(file_path) + file_path = str(file_path) doc.to_disk(file_path) doc_d = Doc(en_vocab).from_disk(file_path) assert doc.to_bytes() == doc_d.to_bytes() diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py index 1881b7d0c..b8a31ab5e 100644 --- a/spacy/tests/serialize/test_serialize_extension_attrs.py +++ b/spacy/tests/serialize/test_serialize_extension_attrs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.vocab import Vocab @@ -10,9 +7,7 @@ from spacy.vocab import Vocab def doc_w_attrs(en_tokenizer): Doc.set_extension("_test_attr", default=False) Doc.set_extension("_test_prop", getter=lambda doc: len(doc.text)) - Doc.set_extension( - "_test_method", method=lambda doc, arg: "{}{}".format(len(doc.text), arg) - ) + Doc.set_extension("_test_method", method=lambda doc, arg: f"{len(doc.text)}{arg}") doc = en_tokenizer("This is a test.") doc._._test_attr = "test" return doc @@ -24,4 +19,4 @@ def test_serialize_ext_attrs_from_bytes(doc_w_attrs): assert doc._.has("_test_attr") assert doc._._test_attr == "test" assert doc._._test_prop == len(doc.text) - assert doc._._test_method("test") == "{}{}".format(len(doc.text), "test") + assert doc._._test_method("test") == f"{len(doc.text)}test" diff 
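The serialization-test hunks above show two substitutions that recur through the rest of the diff: `path2str(p)` becomes the built-in `str(p)`, which handles `pathlib.Path` objects fine on Python 3, and `"{}{}".format(...)` becomes an f-string, available from 3.6, the new minimum version. A minimal sketch of both:

```python
from pathlib import Path
import tempfile

# str() on a Path replaces the old path2str() compat shim.
with tempfile.TemporaryDirectory() as d:
    file_path = Path(d) / "doc"
    assert str(file_path).endswith("doc")

# f-strings (3.6+) replace str.format().
text, arg = "This is a test.", "test"
assert f"{len(text)}{arg}" == "{}{}".format(len(text), arg) == "15test"
```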
--git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index b19c11864..91036a496 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.util import ensure_path from spacy.kb import KnowledgeBase diff --git a/spacy/tests/serialize/test_serialize_language.py b/spacy/tests/serialize/test_serialize_language.py index efc5d181c..4089a0d07 100644 --- a/spacy/tests/serialize/test_serialize_language.py +++ b/spacy/tests/serialize/test_serialize_language.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import re from spacy.language import Language diff --git a/spacy/tests/serialize/test_serialize_pipeline.py b/spacy/tests/serialize/test_serialize_pipeline.py index 797fa95f8..0ad9bc4d4 100644 --- a/spacy/tests/serialize/test_serialize_pipeline.py +++ b/spacy/tests/serialize/test_serialize_pipeline.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.pipeline import Tagger, DependencyParser, EntityRecognizer from spacy.pipeline import Tensorizer, TextCategorizer, SentenceRecognizer diff --git a/spacy/tests/serialize/test_serialize_tokenizer.py b/spacy/tests/serialize/test_serialize_tokenizer.py index 9a273980c..f504ed048 100644 --- a/spacy/tests/serialize/test_serialize_tokenizer.py +++ b/spacy/tests/serialize/test_serialize_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class from spacy.tokenizer import Tokenizer diff --git a/spacy/tests/serialize/test_serialize_vocab_strings.py b/spacy/tests/serialize/test_serialize_vocab_strings.py index 1671845ee..359a0657f 100644 --- a/spacy/tests/serialize/test_serialize_vocab_strings.py +++ b/spacy/tests/serialize/test_serialize_vocab_strings.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.strings import StringStore diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py index 77f1af020..ad56e4c54 100644 --- a/spacy/tests/test_architectures.py +++ b/spacy/tests/test_architectures.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy import registry from thinc.v2v import Affine diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 3b75e760a..b4aebe521 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.en import English diff --git a/spacy/tests/test_displacy.py b/spacy/tests/test_displacy.py index 2d1f1bd8f..4436b437f 100644 --- a/spacy/tests/test_displacy.py +++ b/spacy/tests/test_displacy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy import displacy from spacy.displacy.render import DependencyRenderer @@ -80,10 +77,10 @@ def test_displacy_rtl(): html = displacy.render(doc, page=True, style="dep") assert "direction: rtl" in html assert 'direction="rtl"' in html - assert 'lang="{}"'.format(nlp.lang) in html + assert f'lang="{nlp.lang}"' in html html = displacy.render(doc, page=True, style="ent") assert "direction: rtl" in html - assert 'lang="{}"'.format(nlp.lang) in html + assert f'lang="{nlp.lang}"' in html def test_displacy_render_wrapper(en_vocab): diff 
--git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index 9d644d062..46c54b879 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -1,9 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import spacy from spacy.errors import AlignmentError -from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Example, DocAnnotation +from spacy.gold import ( + biluo_tags_from_offsets, + offsets_from_biluo_tags, + Example, + DocAnnotation, +) from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo from spacy.gold import GoldCorpus, docs_to_json, align from spacy.lang.en import English @@ -14,14 +15,37 @@ from .util import make_tempdir import pytest import srsly + @pytest.fixture def doc(): text = "Sarah's sister flew to Silicon Valley via London." - tags = ['NNP', 'POS', 'NN', 'VBD', 'IN', 'NNP', 'NNP', 'IN', 'NNP', '.'] + tags = ["NNP", "POS", "NN", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."] # head of '.' is intentionally nonprojective for testing heads = [2, 0, 3, 3, 3, 6, 4, 3, 7, 5] - deps = ['poss', 'case', 'nsubj', 'ROOT', 'prep', 'compound', 'pobj', 'prep', 'pobj', 'punct'] - lemmas = ['Sarah', "'s", 'sister', 'fly', 'to', 'Silicon', 'Valley', 'via', 'London', '.'] + deps = [ + "poss", + "case", + "nsubj", + "ROOT", + "prep", + "compound", + "pobj", + "prep", + "pobj", + "punct", + ] + lemmas = [ + "Sarah", + "'s", + "sister", + "fly", + "to", + "Silicon", + "Valley", + "via", + "London", + ".", + ] biluo_tags = ["U-PERSON", "O", "O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] cats = {"TRAVEL": 1.0, "BAKING": 0.0} nlp = English() @@ -45,7 +69,7 @@ def merged_dict(): "words": ["Hi", "there", "everyone", "It", "is", "just", "me"], "tags": ["INTJ", "ADV", "PRON", "PRON", "AUX", "ADV", "PRON"], "sent_starts": [1, 0, 0, 1, 0, 0, 0, 0], - } + } def test_gold_biluo_U(en_vocab): @@ -141,7 +165,9 @@ def test_roundtrip_docs_to_json(doc): deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] lemmas = [t.lemma_ for t in doc] - biluo_tags = iob_to_biluo([t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc]) + biluo_tags = iob_to_biluo( + [t.ent_iob_ + "-" + t.ent_type_ if t.ent_type_ else "O" for t in doc] + ) cats = doc.cats # roundtrip to JSON @@ -214,7 +240,6 @@ def test_roundtrip_docs_to_json(doc): def test_projective_train_vs_nonprojective_dev(doc): nlp = English() - text = doc.text deps = [t.dep_ for t in doc] heads = [t.head.i for t in doc] @@ -244,9 +269,6 @@ def test_projective_train_vs_nonprojective_dev(doc): def test_ignore_misaligned(doc): nlp = English() text = doc.text - deps = [t.dep_ for t in doc] - heads = [t.head.i for t in doc] - with make_tempdir() as tmpdir: jsonl_file = tmpdir / "test.jsonl" data = [docs_to_json(doc)] @@ -268,17 +290,12 @@ def test_ignore_misaligned(doc): # doesn't raise an AlignmentError, but there is nothing to iterate over # because the only example can't be aligned - train_reloaded_example = list(goldcorpus.train_dataset(nlp, - ignore_misaligned=True)) + train_reloaded_example = list(goldcorpus.train_dataset(nlp, ignore_misaligned=True)) assert len(train_reloaded_example) == 0 def test_make_orth_variants(doc): nlp = English() - text = doc.text - deps = [t.dep_ for t in doc] - heads = [t.head.i for t in doc] - with make_tempdir() as tmpdir: jsonl_file = tmpdir / "test.jsonl" # write to JSONL train dicts @@ -286,9 +303,8 @@ def test_make_orth_variants(doc): goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file)) # due to randomness, test only that this runs with 
no errors for now - train_reloaded_example = next(goldcorpus.train_dataset(nlp, - orth_variant_level=0.2)) - train_goldparse = train_reloaded_example.gold + train_reloaded_example = next(goldcorpus.train_dataset(nlp, orth_variant_level=0.2)) + train_goldparse = train_reloaded_example.gold # noqa: F841 @pytest.mark.parametrize( diff --git a/spacy/tests/test_json_schemas.py b/spacy/tests/test_json_schemas.py index 89e797c1a..1330d3a65 100644 --- a/spacy/tests/test_json_schemas.py +++ b/spacy/tests/test_json_schemas.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from spacy.util import get_json_validator, validate_json, validate_schema from spacy.cli._schemas import META_SCHEMA, TRAINING_SCHEMA from spacy.matcher._schemas import TOKEN_PATTERN_SCHEMA diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py index 102b87142..58db0a040 100644 --- a/spacy/tests/test_language.py +++ b/spacy/tests/test_language.py @@ -1,10 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools - import pytest -from spacy.compat import is_python2 from spacy.gold import GoldParse from spacy.language import Language from spacy.tokens import Doc, Span @@ -134,9 +129,6 @@ def test_language_pipe(nlp2, n_process, texts): assert_docs_equal(doc, expected_doc) -@pytest.mark.skipif( - is_python2, reason="python2 seems to be unable to handle iterator properly" -) @pytest.mark.parametrize("n_process", [1, 2]) def test_language_pipe_stream(nlp2, n_process, texts): # check if nlp.pipe can handle infinite length iterator properly. diff --git a/spacy/tests/test_lemmatizer.py b/spacy/tests/test_lemmatizer.py index 701222afc..c2534ca22 100644 --- a/spacy/tests/test_lemmatizer.py +++ b/spacy/tests/test_lemmatizer.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest from spacy.tokens import Doc from spacy.language import Language diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 4075ccf64..09e0fb561 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -1,13 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import os import ctypes from pathlib import Path from spacy import util from spacy import prefer_gpu, require_gpu -from spacy.compat import symlink_to, symlink_remove, path2str, is_windows +from spacy.compat import symlink_to, symlink_remove, is_windows from spacy._ml import PrecomputableAffine from subprocess import CalledProcessError @@ -25,7 +22,7 @@ def symlink(): @pytest.fixture(scope="function") def symlink_setup_target(request, symlink_target, symlink): if not symlink_target.exists(): - os.mkdir(path2str(symlink_target)) + os.mkdir(str(symlink_target)) # yield -- need to cleanup even if assertion fails # https://github.com/pytest-dev/pytest/issues/2508#issuecomment-309934240 @@ -33,7 +30,7 @@ def symlink_setup_target(request, symlink_target, symlink): # Remove symlink only if it was created if symlink.exists(): symlink_remove(symlink) - os.rmdir(path2str(symlink_target)) + os.rmdir(str(symlink_target)) request.addfinalizer(cleanup) diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 65288527a..e4c67b672 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy import srsly diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py index 888028b6c..efaf80b4f 100644 --- 
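In the `test_misc.py` symlink fixture above, `os.mkdir(path2str(...))` becomes `os.mkdir(str(...))`. An aside, not something the diff itself relies on: since Python 3.6, `os` functions also accept `pathlib.Path` objects directly (PEP 519), so the explicit `str()` is a stylistic choice rather than a requirement:

```python
import os
import tempfile
from pathlib import Path

base = Path(tempfile.mkdtemp())
target = base / "sub"
os.mkdir(target)        # Path accepted directly on 3.6+ (PEP 519)
os.rmdir(str(target))   # explicit str() conversion works too
os.rmdir(base)
```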
a/spacy/tests/test_scorer.py +++ b/spacy/tests/test_scorer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from numpy.testing import assert_almost_equal, assert_array_almost_equal import pytest from pytest import approx diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py index ddaa71059..473d5017d 100644 --- a/spacy/tests/test_tok2vec.py +++ b/spacy/tests/test_tok2vec.py @@ -1,12 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy._ml import Tok2Vec from spacy.vocab import Vocab from spacy.tokens import Doc -from spacy.compat import unicode_ def get_batch(batch_size): @@ -16,7 +12,7 @@ def get_batch(batch_size): for size in range(1, batch_size + 1): # Make the words numbers, so that they're distinct # across the batch, and easy to track. - numbers = [unicode_(i) for i in range(start, start + size)] + numbers = [str(i) for i in range(start, start + size)] docs.append(Doc(vocab, words=numbers)) start += size return docs diff --git a/spacy/tests/tokenizer/test_exceptions.py b/spacy/tests/tokenizer/test_exceptions.py index c2011487e..8276d7aea 100644 --- a/spacy/tests/tokenizer/test_exceptions.py +++ b/spacy/tests/tokenizer/test_exceptions.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import sys import pytest diff --git a/spacy/tests/tokenizer/test_explain.py b/spacy/tests/tokenizer/test_explain.py index 2d71588cc..3e7681234 100644 --- a/spacy/tests/tokenizer/test_explain.py +++ b/spacy/tests/tokenizer/test_explain.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.util import get_lang_class @@ -58,7 +55,7 @@ LANGUAGES = [ @pytest.mark.parametrize("lang", LANGUAGES) def test_tokenizer_explain(lang): tokenizer = get_lang_class(lang).Defaults.create_tokenizer() - examples = pytest.importorskip("spacy.lang.{}.examples".format(lang)) + examples = pytest.importorskip(f"spacy.lang.{lang}.examples") for sentence in examples.sentences: tokens = [t.text for t in tokenizer(sentence) if not t.is_space] debug_tokens = [t[1] for t in tokenizer.explain(sentence)] diff --git a/spacy/tests/tokenizer/test_naughty_strings.py b/spacy/tests/tokenizer/test_naughty_strings.py index 36c69611e..e93d5654f 100644 --- a/spacy/tests/tokenizer/test_naughty_strings.py +++ b/spacy/tests/tokenizer/test_naughty_strings.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest # Examples taken from the "Big List of Naughty Strings" diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 5ac681c5e..3dce1ae31 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.vocab import Vocab from spacy.tokenizer import Tokenizer diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py index e2c0e3de8..9f673d5d8 100644 --- a/spacy/tests/tokenizer/test_urls.py +++ b/spacy/tests/tokenizer/test_urls.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS diff --git a/spacy/tests/tokenizer/test_whitespace.py b/spacy/tests/tokenizer/test_whitespace.py index 74c9b369b..c7b9d7c6d 100644 --- a/spacy/tests/tokenizer/test_whitespace.py +++ b/spacy/tests/tokenizer/test_whitespace.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ 
import unicode_literals - import pytest diff --git a/spacy/tests/util.py b/spacy/tests/util.py index 175480fe7..0516e9272 100644 --- a/spacy/tests/util.py +++ b/spacy/tests/util.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import numpy import tempfile import shutil @@ -9,7 +6,6 @@ import srsly from pathlib import Path from spacy.tokens import Doc, Span from spacy.attrs import POS, HEAD, DEP -from spacy.compat import path2str @contextlib.contextmanager @@ -23,7 +19,7 @@ def make_tempfile(mode="r"): def make_tempdir(): d = Path(tempfile.mkdtemp()) yield d - shutil.rmtree(path2str(d)) + shutil.rmtree(str(d)) def get_doc(vocab, words=[], pos=None, heads=None, deps=None, tags=None, ents=None): diff --git a/spacy/tests/vocab_vectors/test_lexeme.py b/spacy/tests/vocab_vectors/test_lexeme.py index d84a56981..e033aa7c6 100644 --- a/spacy/tests/vocab_vectors/test_lexeme.py +++ b/spacy/tests/vocab_vectors/test_lexeme.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import IS_ALPHA, IS_DIGIT diff --git a/spacy/tests/vocab_vectors/test_lookups.py b/spacy/tests/vocab_vectors/test_lookups.py index f78dd33c4..fff3d24ef 100644 --- a/spacy/tests/vocab_vectors/test_lookups.py +++ b/spacy/tests/vocab_vectors/test_lookups.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.lookups import Lookups, Table from spacy.strings import get_string_id diff --git a/spacy/tests/vocab_vectors/test_similarity.py b/spacy/tests/vocab_vectors/test_similarity.py index f98f0e6e0..b5f7303b5 100644 --- a/spacy/tests/vocab_vectors/test_similarity.py +++ b/spacy/tests/vocab_vectors/test_similarity.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from spacy.tokens import Doc diff --git a/spacy/tests/vocab_vectors/test_stringstore.py b/spacy/tests/vocab_vectors/test_stringstore.py index 75b1116dd..c71d5f3f2 100644 --- a/spacy/tests/vocab_vectors/test_stringstore.py +++ b/spacy/tests/vocab_vectors/test_stringstore.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.strings import StringStore diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py index b688ab9dd..8684ad018 100644 --- a/spacy/tests/vocab_vectors/test_vectors.py +++ b/spacy/tests/vocab_vectors/test_vectors.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest import numpy from numpy.testing import assert_allclose diff --git a/spacy/tests/vocab_vectors/test_vocab_api.py b/spacy/tests/vocab_vectors/test_vocab_api.py index d22db2d8b..a687059be 100644 --- a/spacy/tests/vocab_vectors/test_vocab_api.py +++ b/spacy/tests/vocab_vectors/test_vocab_api.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import pytest from spacy.attrs import LEMMA, ORTH, PROB, IS_ALPHA from spacy.parts_of_speech import NOUN, VERB diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index f0120c708..7491a11fc 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,8 +1,5 @@ # cython: embedsignature=True # cython: profile=True -# coding: utf8 -from __future__ import unicode_literals - from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc from libc.string cimport memcpy, memset @@ -11,22 +8,20 @@ from cymem.cymem cimport Pool from preshed.maps cimport PreshMap cimport cython -from 
collections import OrderedDict import re from .tokens.doc cimport Doc from .strings cimport hash_string -from .compat import unescape_unicode from .attrs import intify_attrs from .symbols import ORTH from .errors import Errors, Warnings, deprecation_warning from . import util - from .attrs import intify_attrs from .lexeme cimport EMPTY_LEXEME from .symbols import ORTH + cdef class Tokenizer: """Segment text, and create Doc objects with the discovered segment boundaries. @@ -728,14 +723,14 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#to_bytes """ - serializers = OrderedDict(( - ("vocab", lambda: self.vocab.to_bytes()), - ("prefix_search", lambda: _get_regex_pattern(self.prefix_search)), - ("suffix_search", lambda: _get_regex_pattern(self.suffix_search)), - ("infix_finditer", lambda: _get_regex_pattern(self.infix_finditer)), - ("token_match", lambda: _get_regex_pattern(self.token_match)), - ("exceptions", lambda: OrderedDict(sorted(self._rules.items()))) - )) + serializers = { + "vocab": lambda: self.vocab.to_bytes(), + "prefix_search": lambda: _get_regex_pattern(self.prefix_search), + "suffix_search": lambda: _get_regex_pattern(self.suffix_search), + "infix_finditer": lambda: _get_regex_pattern(self.infix_finditer), + "token_match": lambda: _get_regex_pattern(self.token_match), + "exceptions": lambda: dict(sorted(self._rules.items())) + } exclude = util.get_serialization_exclude(serializers, exclude, kwargs) return util.to_bytes(serializers, exclude) @@ -748,20 +743,17 @@ cdef class Tokenizer: DOCS: https://spacy.io/api/tokenizer#from_bytes """ - data = OrderedDict() - deserializers = OrderedDict(( - ("vocab", lambda b: self.vocab.from_bytes(b)), - ("prefix_search", lambda b: data.setdefault("prefix_search", b)), - ("suffix_search", lambda b: data.setdefault("suffix_search", b)), - ("infix_finditer", lambda b: data.setdefault("infix_finditer", b)), - ("token_match", lambda b: data.setdefault("token_match", b)), - ("exceptions", lambda b: data.setdefault("rules", b)) - )) + data = {} + deserializers = { + "vocab": lambda b: self.vocab.from_bytes(b), + "prefix_search": lambda b: data.setdefault("prefix_search", b), + "suffix_search": lambda b: data.setdefault("suffix_search", b), + "infix_finditer": lambda b: data.setdefault("infix_finditer", b), + "token_match": lambda b: data.setdefault("token_match", b), + "exceptions": lambda b: data.setdefault("rules", b) + } exclude = util.get_serialization_exclude(deserializers, exclude, kwargs) msg = util.from_bytes(bytes_data, deserializers, exclude) - for key in ["prefix_search", "suffix_search", "infix_finditer"]: - if key in data: - data[key] = unescape_unicode(data[key]) if data.get("prefix_search"): self.prefix_search = re.compile(data["prefix_search"]).search if data.get("suffix_search"): diff --git a/spacy/tokens/__init__.py b/spacy/tokens/__init__.py index 536ec8349..88428709b 100644 --- a/spacy/tokens/__init__.py +++ b/spacy/tokens/__init__.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - from .doc import Doc from .token import Token from .span import Span diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx index a5d06491a..12690ba50 100644 --- a/spacy/tokens/_retokenize.pyx +++ b/spacy/tokens/_retokenize.pyx @@ -1,9 +1,6 @@ -# coding: utf8 # cython: infer_types=True # cython: bounds_check=False # cython: profile=True -from __future__ import unicode_literals - from libc.string cimport memcpy, memset from libc.stdlib cimport malloc, free from cymem.cymem cimport Pool diff 
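The `tokenizer.pyx` hunk above makes two changes worth calling out: the serializer maps drop `collections.OrderedDict` for plain dict literals, and `from_bytes` no longer routes the stored patterns through `unescape_unicode`, since on Python 3 they are serialized as ordinary `str`. The first is safe because `dict` preserves insertion order on every version spaCy now supports: as a CPython implementation detail in 3.6 and as a language guarantee from 3.7. A sketch:

```python
# Sketch: dict preserves insertion order on 3.6+ (guaranteed from 3.7),
# so the serializer mapping no longer needs collections.OrderedDict.
serializers = {
    "vocab": lambda: b"<vocab bytes>",
    "prefix_search": lambda: "<pattern>",
    "exceptions": lambda: dict(sorted({"b": 2, "a": 1}.items())),
}
assert list(serializers) == ["vocab", "prefix_search", "exceptions"]
assert list(serializers["exceptions"]()) == ["a", "b"]
```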
--git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index b60a6d7b3..d7348659d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import numpy import zlib import srsly diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 716df1087..58423c420 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,10 +1,6 @@ - -# coding: utf8 # cython: infer_types=True # cython: bounds_check=False # cython: profile=True -from __future__ import unicode_literals - cimport cython cimport numpy as np from libc.string cimport memcpy, memset @@ -28,7 +24,7 @@ from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS from ..util import normalize_slice -from ..compat import is_config, copy_reg, pickle, basestring_ +from ..compat import copy_reg, pickle from ..errors import deprecation_warning, models_warning, user_warning from ..errors import Errors, Warnings from .. import util @@ -327,9 +323,7 @@ cdef class Doc: return "".join([t.text_with_ws for t in self]).encode("utf-8") def __str__(self): - if is_config(python3=True): - return self.__unicode__() - return self.__bytes__() + return self.__unicode__() def __repr__(self): return self.__str__() @@ -683,7 +677,7 @@ cdef class Doc: cdef np.ndarray[attr_t, ndim=2] output # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 - if isinstance(py_attr_ids, basestring_): + if isinstance(py_attr_ids, str): # Handle inputs like doc.to_array('ORTH') py_attr_ids = [py_attr_ids] elif not hasattr(py_attr_ids, "__iter__"): @@ -772,7 +766,7 @@ cdef class Doc: """ # Handle scalar/list inputs of strings/ints for py_attr_ids # See also #3064 - if isinstance(attrs, basestring_): + if isinstance(attrs, str): # Handle inputs like doc.to_array('ORTH') attrs = [attrs] elif not hasattr(attrs, "__iter__"): diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 957e853ca..9e9322d65 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - cimport numpy as np from libc.math cimport sqrt @@ -20,7 +17,6 @@ from ..lexeme cimport Lexeme from ..symbols cimport dep from ..util import normalize_slice -from ..compat import is_config, basestring_ from ..errors import Errors, TempErrors, Warnings, user_warning, models_warning from ..errors import deprecation_warning from .underscore import Underscore, get_ext_args @@ -110,9 +106,9 @@ cdef class Span: self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) else: self.end_char = 0 - if isinstance(label, basestring_): + if isinstance(label, str): label = doc.vocab.strings.add(label) - if isinstance(kb_id, basestring_): + if isinstance(kb_id, str): kb_id = doc.vocab.strings.add(kb_id) if label not in doc.vocab.strings: raise ValueError(Errors.E084.format(label=label)) @@ -157,9 +153,7 @@ cdef class Span: return self.end - self.start def __repr__(self): - if is_config(python3=True): - return self.text - return self.text.encode("utf-8") + return self.text def __getitem__(self, object i): """Get a `Token` or a `Span` object @@ -478,7 +472,7 @@ cdef class Span: @property def tensor(self): """The span's slice of the doc's tensor. - + RETURNS (ndarray[ndim=2, dtype='float32']): A 2D numpy or cupy array representing the span's semantics. 
""" diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 8b15a4223..8e6290187 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,7 +1,4 @@ # cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals - from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free # Compiler crashes on memory view coercion without this. Should report bug. @@ -23,7 +20,6 @@ from ..symbols cimport conj from .. import parts_of_speech from .. import util -from ..compat import is_config from ..errors import Errors, Warnings, user_warning, models_warning from .underscore import Underscore, get_ext_args from .morphanalysis cimport MorphAnalysis @@ -122,9 +118,7 @@ cdef class Token: return self.text.encode('utf8') def __str__(self): - if is_config(python3=True): - return self.__unicode__() - return self.__bytes__() + return self.__unicode__() def __repr__(self): return self.__str__() diff --git a/spacy/tokens/underscore.py b/spacy/tokens/underscore.py index b36fe9294..328851945 100644 --- a/spacy/tokens/underscore.py +++ b/spacy/tokens/underscore.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import functools import copy diff --git a/spacy/util.py b/spacy/util.py index 693136bc1..4e6c10e2b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,12 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals, print_function - import os import importlib +import importlib.util import re from pathlib import Path import random -from collections import OrderedDict from thinc.neural._classes.model import Model from thinc.neural.ops import NumpyOps import functools @@ -27,8 +24,7 @@ except ImportError: cupy = None from .symbols import ORTH -from .compat import cupy, CudaStream, path2str, basestring_, unicode_ -from .compat import import_file +from .compat import cupy, CudaStream from .errors import Errors, Warnings, deprecation_warning @@ -119,7 +115,7 @@ def ensure_path(path): path: Anything. If string, it's converted to Path. RETURNS: Path or original argument. 
""" - if isinstance(path, basestring_): + if isinstance(path, str): return Path(path) else: return path @@ -138,7 +134,7 @@ def load_language_data(path): path = path.with_suffix(path.suffix + ".gz") if path.exists(): return srsly.read_gzip_json(path) - raise ValueError(Errors.E160.format(path=path2str(path))) + raise ValueError(Errors.E160.format(path=path)) def get_module_path(module): @@ -156,8 +152,8 @@ def load_model(name, **overrides): """ data_path = get_data_path() if not data_path or not data_path.exists(): - raise IOError(Errors.E049.format(path=path2str(data_path))) - if isinstance(name, basestring_): # in data dir / shortcut + raise IOError(Errors.E049.format(path=data_path)) + if isinstance(name, str): # in data dir / shortcut if name in set([d.name for d in data_path.iterdir()]): return load_model_from_link(name, **overrides) if is_package(name): # installed as package @@ -224,7 +220,7 @@ def load_model_from_init_py(init_file, **overrides): data_dir = "%s_%s-%s" % (meta["lang"], meta["name"], meta["version"]) data_path = model_path / data_dir if not model_path.exists(): - raise IOError(Errors.E052.format(path=path2str(data_path))) + raise IOError(Errors.E052.format(path=data_path)) return load_model_from_path(data_path, meta, **overrides) @@ -236,7 +232,7 @@ def get_model_meta(path): """ model_path = ensure_path(path) if not model_path.exists(): - raise IOError(Errors.E052.format(path=path2str(model_path))) + raise IOError(Errors.E052.format(path=model_path)) meta_path = model_path / "meta.json" if not meta_path.is_file(): raise IOError(Errors.E053.format(path=meta_path)) @@ -417,7 +413,7 @@ def update_exc(base_exceptions, *addition_dicts): exc = dict(base_exceptions) for additions in addition_dicts: for orth, token_attrs in additions.items(): - if not all(isinstance(attr[ORTH], unicode_) for attr in token_attrs): + if not all(isinstance(attr[ORTH], str) for attr in token_attrs): raise ValueError(Errors.E055.format(key=orth, orths=token_attrs)) described_orth = "".join(attr[ORTH] for attr in token_attrs) if orth != described_orth: @@ -612,7 +608,7 @@ def filter_spans(spans): def to_bytes(getters, exclude): - serialized = OrderedDict() + serialized = {} for key, getter in getters.items(): # Split to support file names like meta.json if key.split(".")[0] not in exclude: @@ -649,6 +645,20 @@ def from_disk(path, readers, exclude): return path +def import_file(name, loc): + """Import module from a file. Used to load models from a directory. + + name (unicode): Name of module to load. + loc (unicode / Path): Path to the file. + RETURNS: The loaded module. + """ + loc = str(loc) + spec = importlib.util.spec_from_file_location(name, str(loc)) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + return module + + def minify_html(html): """Perform a template-specific, rudimentary HTML minification for displaCy. Disclaimer: NOT a general-purpose solution, only removes indentation and @@ -726,8 +736,8 @@ def validate_json(data, validator): err_path = "" msg = err.message + " " + err_path if err.context: # Error has suberrors, e.g. 
if schema uses anyOf - suberrs = [" - {}".format(suberr.message) for suberr in err.context] - msg += ":\n{}".format("".join(suberrs)) + suberrs = [f" - {suberr.message}" for suberr in err.context] + msg += f":\n{''.join(suberrs)}" errors.append(msg) return errors diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 6b26bf123..b12c8d833 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,13 +1,9 @@ -# coding: utf8 -from __future__ import unicode_literals - cimport numpy as np from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset import functools import numpy -from collections import OrderedDict import srsly from thinc.neural.util import get_array_module from thinc.neural._classes.model import Model @@ -15,7 +11,6 @@ from thinc.neural._classes.model import Model from .strings cimport StringStore from .strings import get_string_id -from .compat import basestring_, path2str from .errors import Errors from . import util @@ -74,7 +69,7 @@ cdef class Vectors: shape = (0,0) data = numpy.zeros(shape, dtype="f") self.data = data - self.key2row = OrderedDict() + self.key2row = {} if self.data is not None: self._unset = cppset[int]({i for i in range(self.data.shape[0])}) else: @@ -339,7 +334,7 @@ cdef class Vectors: sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1] scores[i:i+batch_size] = scores[sorted_index] best_rows[i:i+batch_size] = best_rows[sorted_index] - + xp = get_array_module(self.data) # Round values really close to 1 or -1 scores = xp.around(scores, decimals=4, out=scores) @@ -347,7 +342,7 @@ cdef class Vectors: scores = xp.clip(scores, a_min=-1, a_max=1, out=scores) row2key = {row: key for key, row in self.key2row.items()} keys = xp.asarray( - [[row2key[row] for row in best_rows[i] if row in row2key] + [[row2key[row] for row in best_rows[i] if row in row2key] for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) @@ -372,7 +367,7 @@ cdef class Vectors: break else: raise IOError(Errors.E061.format(filename=path)) - bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) + bin_loc = path / f"vectors.{dims}.{dtype}.bin" xp = get_array_module(self.data) self.data = None with bin_loc.open("rb") as file_: @@ -402,10 +397,10 @@ cdef class Vectors: save_array = lambda arr, file_: xp.save(file_, arr, allow_pickle=False) else: save_array = lambda arr, file_: xp.save(file_, arr) - serializers = OrderedDict(( - ("vectors", lambda p: save_array(self.data, p.open("wb"))), - ("key2row", lambda p: srsly.write_msgpack(p, self.key2row)) - )) + serializers = { + "vectors": lambda p: save_array(self.data, p.open("wb")), + "key2row": lambda p: srsly.write_msgpack(p, self.key2row) + } return util.to_disk(path, serializers, []) def from_disk(self, path, **kwargs): @@ -435,11 +430,11 @@ cdef class Vectors: if path.exists(): self.data = xp.load(str(path)) - serializers = OrderedDict(( - ("key2row", load_key2row), - ("keys", load_keys), - ("vectors", load_vectors), - )) + serializers = { + "key2row": load_key2row, + "keys": load_keys, + "vectors": load_vectors, + } util.from_disk(path, serializers, []) return self @@ -457,10 +452,10 @@ cdef class Vectors: else: return srsly.msgpack_dumps(self.data) - serializers = OrderedDict(( - ("key2row", lambda: srsly.msgpack_dumps(self.key2row)), - ("vectors", serialize_weights) - )) + serializers = { + "key2row": lambda: srsly.msgpack_dumps(self.key2row), + "vectors": serialize_weights + } return 
util.to_bytes(serializers, []) def from_bytes(self, data, **kwargs): @@ -478,9 +473,9 @@ cdef class Vectors: else: self.data = srsly.msgpack_loads(b) - deserializers = OrderedDict(( - ("key2row", lambda b: self.key2row.update(srsly.msgpack_loads(b))), - ("vectors", deserialize_weights) - )) + deserializers = { + "key2row": lambda b: self.key2row.update(srsly.msgpack_loads(b)), + "vectors": deserialize_weights + } util.from_bytes(data, deserializers, []) return self diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3cf0095ee..c7e74f36c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,10 +1,7 @@ -# coding: utf8 # cython: profile=True -from __future__ import unicode_literals from libc.string cimport memcpy import srsly -from collections import OrderedDict from thinc.neural.util import get_array_module from .lexeme cimport EMPTY_LEXEME @@ -14,7 +11,7 @@ from .tokens.token cimport Token from .attrs cimport PROB, LANG, ORTH, TAG, POS from .structs cimport SerializedLexemeC -from .compat import copy_reg, basestring_ +from .compat import copy_reg from .errors import Errors from .lemmatizer import Lemmatizer from .attrs import intify_attrs, NORM @@ -335,14 +332,14 @@ cdef class Vocab: """Retrieve a vector for a word in the vocabulary. Words can be looked up by string or int ID. If no vectors data is loaded, ValueError is raised. - - If `minn` is defined, then the resulting vector uses Fasttext's + + If `minn` is defined, then the resulting vector uses Fasttext's subword features by average over ngrams of `orth`. orth (int / unicode): The hash value of a word, or its unicode string. - minn (int): Minimum n-gram length used for Fasttext's ngram computation. + minn (int): Minimum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. - maxn (int): Maximum n-gram length used for Fasttext's ngram computation. + maxn (int): Maximum n-gram length used for Fasttext's ngram computation. Defaults to the length of `orth`. RETURNS (numpy.ndarray): A word vector. Size and shape determined by the `vocab.vectors` instance. 
Usually, a @@ -350,7 +347,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#get_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) word = self[orth].orth_ if orth in self.vectors.key2row: @@ -397,7 +394,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#set_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) if self.vectors.is_full and orth not in self.vectors: new_rows = max(100, int(self.vectors.shape[0]*1.3)) @@ -419,7 +416,7 @@ cdef class Vocab: DOCS: https://spacy.io/api/vocab#has_vector """ - if isinstance(orth, basestring_): + if isinstance(orth, str): orth = self.strings.add(orth) return orth in self.vectors @@ -488,12 +485,12 @@ cdef class Vocab: else: return self.vectors.to_bytes() - getters = OrderedDict(( - ("strings", lambda: self.strings.to_bytes()), - ("lexemes", lambda: self.lexemes_to_bytes()), - ("vectors", deserialize_vectors), - ("lookups", lambda: self.lookups.to_bytes()) - )) + getters = { + "strings": lambda: self.strings.to_bytes(), + "lexemes": lambda: self.lexemes_to_bytes(), + "vectors": deserialize_vectors, + "lookups": lambda: self.lookups.to_bytes() + } exclude = util.get_serialization_exclude(getters, exclude, kwargs) return util.to_bytes(getters, exclude) @@ -512,12 +509,12 @@ cdef class Vocab: else: return self.vectors.from_bytes(b) - setters = OrderedDict(( - ("strings", lambda b: self.strings.from_bytes(b)), - ("lexemes", lambda b: self.lexemes_from_bytes(b)), - ("vectors", lambda b: serialize_vectors(b)), - ("lookups", lambda b: self.lookups.from_bytes(b)) - )) + setters = { + "strings": lambda b: self.strings.from_bytes(b), + "lexemes": lambda b: self.lexemes_from_bytes(b), + "vectors": lambda b: serialize_vectors(b), + "lookups": lambda b: self.lookups.from_bytes(b) + } exclude = util.get_serialization_exclude(setters, exclude, kwargs) util.from_bytes(bytes_data, setters, exclude) if self.vectors.name is not None: diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 50ba0e3d9..c9c7a010c 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -677,50 +677,3 @@ of one entity) or when merging spans with | ----------- | -------- | -------------------- | | `spans` | iterable | The spans to filter. | | **RETURNS** | list | The filtered spans. | - -## Compatibility functions {#compat source="spacy/compaty.py"} - -All Python code is written in an **intersection of Python 2 and Python 3**. This -is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or -platform compatibility only lives in `spacy.compat`. To distinguish them from -the builtin functions, replacement functions are suffixed with an underscore, -e.g. `unicode_`. - -> #### Example -> -> ```python -> from spacy.compat import unicode_ -> -> compatible_unicode = unicode_("hello world") -> ``` - -| Name | Python 2 | Python 3 | -| -------------------- | ---------------------------------- | ----------- | -| `compat.bytes_` | `str` | `bytes` | -| `compat.unicode_` | `unicode` | `str` | -| `compat.basestring_` | `basestring` | `str` | -| `compat.input_` | `raw_input` | `input` | -| `compat.path2str` | `str(path)` with `.decode('utf8')` | `str(path)` | - -### compat.is_config {#compat.is_config tag="function"} - -Check if a specific configuration of Python version and operating system matches -the user's setup. Mostly used to display targeted error messages. 
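Once `is_config()` is removed, the checks it bundled reduce to the standard library: the Python 2 dimension can no longer be true, and the platform flags map onto `sys.platform`. A rough sketch of the equivalents, not part of spaCy's API:

```python
import sys

# Standard-library equivalents of the removed is_config() flags.
# Only the platform dimension survives; "python2" can no longer hold.
is_windows = sys.platform == "win32"
is_linux = sys.platform.startswith("linux")
is_osx = sys.platform == "darwin"

if is_windows and sys.version_info < (3, 6):
    # Unreachable on a supported install: spaCy now requires Python 3.6+.
    print("Please upgrade to 64-bit Python 3.6 or later.")
```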
- -> #### Example -> -> ```python -> from spacy.compat import is_config -> -> if is_config(python2=True, windows=True): -> print("You are using Python 2 on Windows.") -> ``` - -| Name | Type | Description | -| ----------- | ---- | ---------------------------------------------------------------- | -| `python2` | bool | spaCy is executed with Python 2.x. | -| `python3` | bool | spaCy is executed with Python 3.x. | -| `windows` | bool | spaCy is executed on Windows. | -| `linux` | bool | spaCy is executed on Linux. | -| `osx` | bool | spaCy is executed on OS X or macOS. | -| **RETURNS** | bool | Whether the specified configuration matches the user's platform. | diff --git a/website/docs/usage/index.md b/website/docs/usage/index.md index 2b0045bc3..6c398d584 100644 --- a/website/docs/usage/index.md +++ b/website/docs/usage/index.md @@ -8,9 +8,9 @@ menu: - ['Changelog', 'changelog'] --- -spaCy is compatible with **64-bit CPython 2.7 / 3.5+** and runs on -**Unix/Linux**, **macOS/OS X** and **Windows**. The latest spaCy releases are -available over [pip](https://pypi.python.org/pypi/spacy) and +spaCy is compatible with **64-bit CPython 3.6+** and runs on **Unix/Linux**, +**macOS/OS X** and **Windows**. The latest spaCy releases are available over +[pip](https://pypi.python.org/pypi/spacy) and [conda](https://anaconda.org/conda-forge/spacy). > #### 📖 Looking for the old docs? @@ -207,14 +207,7 @@ Install a version of the [Visual C++ Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/) or [Visual Studio Express](https://www.visualstudio.com/vs/visual-studio-express/) -that matches the version that was used to compile your Python interpreter. For -official distributions these are: - -| Distribution | Version | -| ------------ | ------------------ | -| Python 2.7 | Visual Studio 2008 | -| Python 3.4 | Visual Studio 2010 | -| Python 3.5+ | Visual Studio 2015 | +that matches the version that was used to compile your Python interpreter. ### Run tests {#run-tests} diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index b7b840999..7382f2b8c 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -367,7 +367,7 @@ tokens and a conditional message based on the document length. import spacy def my_component(doc): - print("After tokenization, this doc has {} tokens.".format(len(doc))) + print(f"After tokenization, this doc has {len(doc)} tokens.") print("The part-of-speech tags are:", [token.pos_ for token in doc]) if len(doc) < 10: print("This is a pretty short document.") @@ -602,7 +602,7 @@ There are three main types of extensions, which can be defined using the [these examples](/usage/examples#custom-components-attr-methods). ```python - Doc.set_extension("hello", method=lambda doc, name: "Hi {}!".format(name)) + Doc.set_extension("hello", method=lambda doc, name: f"Hi {name}!") assert doc._.hello("Bob") == "Hi Bob!" ``` diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 5a3a95a53..479bdd264 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -304,12 +304,6 @@ print(doc.vocab.strings["coffee"]) # 3197928453018144401 print(doc.vocab.strings[3197928453018144401]) # 'coffee' ``` -> #### What does 'L' at the end of a hash mean? -> -> If you return a hash value in the **Python 2 interpreter**, it'll show up as -> `3197928453018144401L`. The `L` just means "long integer" – it's **not** -> actually a part of the hash value. 
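The note removed above only ever applied to Python 2, which had a separate `long` type whose repr carried a trailing `L`. Python 3 has a single, unbounded `int`, so the caveat is obsolete. A quick check, reusing the hash value quoted in the surrounding docs:

```python
# Python 3 has one unbounded integer type, so a 64-bit hash value is a
# plain int and its repr never shows Python 2's trailing "L".
coffee_hash = 3197928453018144401  # hash of "coffee", as in the docs above
assert isinstance(coffee_hash, int)
assert repr(coffee_hash) == "3197928453018144401"
```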
- Now that all strings are encoded, the entries in the vocabulary **don't need to include the word text** themselves. Instead, they can look it up in the `StringStore` via its hash value. Each entry in the vocabulary, also called
@@ -857,17 +851,16 @@ def put_spans_around_tokens(doc):
     and you can calculate what you need, e.g. <br />, <p> etc.)
     """
     output = []
-    html = '<span class="{classes}">{word}</span>{space}'
     for token in doc:
         if token.is_space:
             output.append(token.text)
         else:
-            classes = "pos-{} dep-{}".format(token.pos_, token.dep_)
-            output.append(html.format(classes=classes, word=token.text, space=token.whitespace_))
+            classes = f"pos-{token.pos_} dep-{token.dep_}"
+            output.append(f'<span class="{classes}">{token.text}</span>{token.whitespace_}')
     string = "".join(output)
     string = string.replace("\n", "")
     string = string.replace("\t", " ")
-    return "<pre>{}</pre>".format(string)
+    return f"<pre>{string}</pre>"

 nlp = spacy.load("en_core_web_sm")