Update feature/lemmatizer from develop

2025-11-22 10:45:45 +03:00 · 2019-03-10 02:45:33 +01:00 · 2019-03-10 02:45:33 +01:00 · 78aba46530
commit 78aba46530
parent 5431c47b91 ce1fe8a510
12 changed files with 207 additions and 97 deletions
--- a/.appveyor.yml
+++ b/.appveyor.yml
@ -1,21 +0,0 @@
-environment:
-  matrix:
-    - PYTHON: "C:\\Python35-x64"
-    - PYTHON: "C:\\Python36-x64"
-    - PYTHON: "C:\\Python37-x64"
-install:
-  # We need wheel installed to build wheels
-  - "%PYTHON%\\python.exe -m pip install wheel"
-  - "%PYTHON%\\python.exe -m pip install cython"
-  - "%PYTHON%\\python.exe -m pip install -r requirements.txt"
-  - "%PYTHON%\\python.exe -m pip install -e ."
-build: off
-test_script:
-  - "%PYTHON%\\python.exe -m pytest spacy/ --no-print-logs"
-after_test:
-  - "%PYTHON%\\python.exe setup.py bdist_wheel"
-artifacts:
-  - path: dist\*
-branches:
-  except:
-    - spacy.io
--- a/.travis.yml
+++ b/.travis.yml
@ -5,23 +5,16 @@ dist: trusty
 group: edge
 python:
   - "2.7"
-   - "3.5"
-   - "3.6"
 os:
  - linux
-env:
-  - VIA=compile
-  - VIA=flake8
 install:
-  - "./travis.sh"
-  - pip install flake8
+  - "pip install -r requirements.txt"
+  - "python setup.py build_ext --inplace"
+  - "pip install -e ."
 script:
  - "cat /proc/cpuinfo | grep flags | head -n 1"
  - "pip install pytest pytest-timeout"
-  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
-  - if [[ "${VIA}" == "flake8" ]]; then flake8 . --count --exclude=spacy/compat.py,spacy/lang --select=E901,E999,F821,F822,F823 --show-source --statistics; fi
-  - if [[ "${VIA}" == "pypi_nightly" ]]; then python -m pytest --tb=native --models --en `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
-  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
+  - "python -m pytest --tb=native spacy"
 branches:
  except:
    - spacy.io
--- a/README.md
+++ b/README.md
@ -14,8 +14,8 @@ released under the MIT license.

 💫 **Version 2.1 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases)

+[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
 [![Travis Build Status](https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square&logo=travis)](https://travis-ci.org/explosion/spaCy)
-[![Appveyor Build Status](https://img.shields.io/appveyor/ci/explosion/spaCy/master.svg?style=flat-square&logo=appveyor)](https://ci.appveyor.com/project/explosion/spaCy)
 [![Current Release Version](https://img.shields.io/github/release/explosion/spacy.svg?style=flat-square)](https://github.com/explosion/spaCy/releases)
 [![pypi Version](https://img.shields.io/pypi/v/spacy.svg?style=flat-square)](https://pypi.python.org/pypi/spacy)
 [![conda Version](https://img.shields.io/conda/vn/conda-forge/spacy.svg?style=flat-square)](https://anaconda.org/conda-forge/spacy)
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -0,0 +1,92 @@
+trigger:
+  batch: true
+  branches:
+    include:
+    - '*'
+    exclude:
+    - 'spacy.io'
+  paths:
+    exclude:
+    - 'website/*'
+    - '*.md'
+
+jobs:
+
+# Perform basic checks for the most important errors (syntax etc.). Uses the
+# config defined in .flake8 and overrides the selected codes.
+- job: 'Validate'
+  pool:
+    vmImage: 'ubuntu-16.04'
+  steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: '3.7'
+  - script: |
+      pip install flake8
+      python -m flake8 spacy --count --select=E901,E999,F821,F822,F823 --show-source --statistics
+    displayName: 'flake8'
+
+- job: 'Test'
+  dependsOn: 'Validate'
+  strategy:
+    matrix:
+      # Python 2.7 currently doesn't work because it seems to be a narrow
+      # unicode build, which causes problems with the regular expressions
+
+      # Python27Linux:
+      #   imageName: 'ubuntu-16.04'
+      #   python.version: '2.7'
+      # Python27Mac:
+      #   imageName: 'macos-10.13'
+      #   python.version: '2.7'
+      Python35Linux:
+        imageName: 'ubuntu-16.04'
+        python.version: '3.5'
+      Python35Windows:
+        imageName: 'vs2017-win2016'
+        python.version: '3.5'
+      Python35Mac:
+        imageName: 'macos-10.13'
+        python.version: '3.5'
+      Python36Linux:
+        imageName: 'ubuntu-16.04'
+        python.version: '3.6'
+      Python36Windows:
+        imageName: 'vs2017-win2016'
+        python.version: '3.6'
+      Python36Mac:
+        imageName: 'macos-10.13'
+        python.version: '3.6'
+      Python37Linux:
+        imageName: 'ubuntu-16.04'
+        python.version: '3.7'
+      Python37Windows:
+        imageName: 'vs2017-win2016'
+        python.version: '3.7'
+      Python37Mac:
+        imageName: 'macos-10.13'
+        python.version: '3.7'
+    maxParallel: 4
+  pool:
+    vmImage: $(imageName)
+
+  steps:
+  - task: UsePythonVersion@0
+    inputs:
+      versionSpec: '$(python.version)'
+      architecture: 'x64'
+
+  # Downgrading pip is necessary to prevent a wheel version incompatibility.
+  # Might be fixed in the future or some other way, so investigate again.
+  - script: |
+      python -m pip install --upgrade pip==18.1
+      pip install -r requirements.txt
+    displayName: 'Install dependencies'
+
+  - script: |
+      python setup.py build_ext --inplace
+      pip install -e .
+    displayName: 'Build and install'
+
+  - script: python -m pytest --tb=native spacy
+    displayName: 'Run tests'
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@ -84,16 +84,54 @@ def _zero_init(model):
@layerize
 def _preprocess_doc(docs, drop=0.0):
    keys = [doc.to_array(LOWER) for doc in docs]
-    ops = Model.ops
    # The dtype here matches what thinc is expecting -- which differs per
    # platform (by int definition). This should be fixed once the problem
    # is fixed on Thinc's side.
-    lengths = ops.asarray([arr.shape[0] for arr in keys], dtype=numpy.int_)
-    keys = ops.xp.concatenate(keys)
-    vals = ops.allocate(keys.shape) + 1.0
+    lengths = numpy.array([arr.shape[0] for arr in keys], dtype=numpy.int_)
+    keys = numpy.concatenate(keys)
+    vals = numpy.zeros(keys.shape, dtype='f')
    return (keys, vals, lengths), None


+def with_cpu(ops, model):
+    """Wrap a model that should run on CPU, transferring inputs and outputs
+    as necessary."""
+    model.to_cpu()
+    def with_cpu_forward(inputs, drop=0.):
+        cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop)
+        gpu_outputs = _to_device(ops, cpu_outputs)
+
+        def with_cpu_backprop(d_outputs, sgd=None):
+            cpu_d_outputs = _to_cpu(d_outputs)
+            return backprop(cpu_d_outputs, sgd=sgd)
+
+        return gpu_outputs, with_cpu_backprop
+
+    return wrap(with_cpu_forward, model)
+
+
+def _to_cpu(X):
+    if isinstance(X, numpy.ndarray):
+        return X
+    elif isinstance(X, tuple):
+        return tuple([_to_cpu(x) for x in X])
+    elif isinstance(X, list):
+        return [_to_cpu(x) for x in X]
+    elif hasattr(X, 'get'):
+        return X.get()
+    else:
+        return X
+
+
+def _to_device(ops, X):
+    if isinstance(X, tuple):
+        return tuple([_to_device(ops, x) for x in X])
+    elif isinstance(X, list):
+        return [_to_device(ops, x) for x in X]
+    else:
+        return ops.asarray(X)
+
+
@layerize
 def _preprocess_doc_bigrams(docs, drop=0.0):
    unigrams = [doc.to_array(LOWER) for doc in docs]
@ -655,7 +693,10 @@ def build_text_classifier(nr_class, width=64, **cfg):
            >> zero_init(Affine(nr_class, width, drop_factor=0.0))
        )

-        linear_model = _preprocess_doc >> LinearModel(nr_class)
+        linear_model = (
+            _preprocess_doc
+            >> with_cpu(Model.ops, LinearModel(nr_class))
+        )
        if cfg.get("exclusive_classes"):
            output_layer = Softmax(nr_class, nr_class * 2)
        else:
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@ -23,15 +23,16 @@ CONVERTERS = {
 }

 # File types
-FILE_TYPES = ("json", "jsonl")
+FILE_TYPES = ("json", "jsonl", "msg")
+FILE_TYPES_STDOUT = ("json", "jsonl")


@plac.annotations(
    input_file=("Input file", "positional", None, str),
-    output_dir=("Output directory for converted file", "positional", None, str),
-    file_type=("Type of data to produce: 'jsonl' or 'json'", "option", "t", str),
+    output_dir=("Output directory. '-' for stdout.", "positional", None, str),
+    file_type=("Type of data to produce: {}".format(FILE_TYPES), "option", "t", str),
    n_sents=("Number of sentences per doc", "option", "n", int),
-    converter=("Name of converter (auto, iob, conllu or ner)", "option", "c", str),
+    converter=("Converter: {}".format(tuple(CONVERTERS.keys())), "option", "c", str),
    lang=("Language (if tokenizer required)", "option", "l", str),
    morphology=("Enable appending morphology to tags", "flag", "m", bool),
 )
@ -58,6 +59,13 @@ def convert(
            "Supported file types: '{}'".format(", ".join(FILE_TYPES)),
            exits=1,
        )
+    if file_type not in FILE_TYPES_STDOUT and output_dir == "-":
+        # TODO: support msgpack via stdout in srsly?
+        msg.fail(
+            "Can't write .{} data to stdout.".format(file_type),
+            "Please specify an output directory.",
+            exits=1,
+        )
    if not input_path.exists():
        msg.fail("Input file not found", input_path, exits=1)
    if output_dir != "-" and not Path(output_dir).exists():
@ -78,6 +86,8 @@ def convert(
            srsly.write_json(output_file, data)
        elif file_type == "jsonl":
            srsly.write_jsonl(output_file, data)
+        elif file_type == "msg":
+            srsly.write_msgpack(output_file, data)
        msg.good("Generated output file ({} documents)".format(len(data)), output_file)
    else:
        # Print to stdout
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -342,7 +342,7 @@ class Errors(object):
            "equal to span length ({span_len}).")
    E122 = ("Cannot find token to be split. Did it get merged?")
    E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: .json, .msg")
+    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
    E125 = ("Unexpected value: {value}")
    E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
            "This is likely a bug in spaCy, so feel free to open an issue.")
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -153,10 +153,13 @@ class GoldCorpus(object):
            loc = util.ensure_path(loc)
            if loc.parts[-1].endswith("json"):
                gold_tuples = read_json_file(loc)
+            elif loc.parts[-1].endswith("jsonl"):
+                gold_tuples = srsly.read_jsonl(loc)
            elif loc.parts[-1].endswith("msg"):
                gold_tuples = srsly.read_msgpack(loc)
            else:
-                raise ValueError(Errors.E124.format(path=path2str(loc)))
+                supported = ("json", "jsonl", "msg")
+                raise ValueError(Errors.E124.format(path=path2str(loc), formats=supported))
            for item in gold_tuples:
                yield item
                i += len(item[1])
--- a/spacy/lang/fr/punctuation.py
+++ b/spacy/lang/fr/punctuation.py
@ -21,7 +21,9 @@ _suffixes = (
        r"(?<=[0-9])%",  # 4% -> ["4", "%"]
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES),
+        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
 )
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@ -379,7 +379,7 @@ _regular_exp = [
 _regular_exp += [
    "^{prefix}[{hyphen}][{al}][{hyphen}{al}{elision}]*$".format(
        prefix=p,
-        hyphen=HYPHENS,   # putting the - first in the [] range avoids having to use a backslash
+        hyphen=HYPHENS,  # putting the - first in the [] range avoids having to use a backslash
        elision=ELISION,
        al=ALPHA_LOWER,
    )
@ -423,5 +423,5 @@ _regular_exp.append(URL_PATTERN)

 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
-    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE
+    "|".join("(?:{})".format(m) for m in _regular_exp), re.IGNORECASE | re.UNICODE
 ).match
--- a/travis.sh
+++ b/travis.sh
@ -1,32 +0,0 @@
-#!/bin/bash
-
-if [ "${VIA}" == "pypi" ]; then
-    rm -rf *
-    pip install spacy-nightly
-    python -m spacy download en
-fi
-
-if [[ "${VIA}" == "sdist" && "${TRAVIS_PULL_REQUEST}" == "false" ]]; then
-  rm -rf *
-  pip uninstall spacy
-  wget https://api.explosion.ai/build/spacy/sdist/$TRAVIS_COMMIT
-  mv $TRAVIS_COMMIT sdist.tgz
-  pip install -U sdist.tgz
-fi
-
-
-if [ "${VIA}" == "compile" ]; then
-  pip install -r requirements.txt
-  python setup.py build_ext --inplace
-  pip install -e .
-fi
-
-#  mkdir -p corpora/en
-#  cd corpora/en
-#  wget --no-check-certificate http://wordnetcode.princeton.edu/3.0/WordNet-3.0.tar.gz
-#  tar -xzf WordNet-3.0.tar.gz
-#  mv WordNet-3.0 wordnet
-#  cd ../../
-#  mkdir models/
-#  python bin/init_model.py en lang_data/ corpora/ models/en
-#fi
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -134,28 +134,50 @@ converter can be specified on the command line, or chosen based on the file
 extension of the input file.

 ```bash
-$ python -m spacy convert [input_file] [output_dir] [--converter] [--n-sents]
-[--morphology]
+$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
+[--n-sents] [--morphology] [--lang]
 ```

-| Argument                                     | Type       | Description                                                |
-| -------------------------------------------- | ---------- | ---------------------------------------------------------- |
-| `input_file`                                 | positional | Input file.                                                |
-| `output_dir`                                 | positional | Output directory for converted JSON file.                  |
-| `converter`, `-c` <Tag variant="new">2</Tag> | option     | Name of converter to use (see below).                      |
-| `--n-sents`, `-n`                            | option     | Number of sentences per document.                          |
-| `--morphology`, `-m`                         | option     | Enable appending morphology to tags.                       |
-| `--help`, `-h`                               | flag       | Show help message and available arguments.                 |
-| **CREATES**                                  | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input). |
+| Argument                                         | Type       | Description                                                                                       |
+| ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------- |
+| `input_file`                                     | positional | Input file.                                                                                       |
+| `output_dir`                                     | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. |
+| `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create (see below).                                                               |
+| `--converter`, `-c` <Tag variant="new">2</Tag>   | option     | Name of converter to use (see below).                                                             |
+| `--n-sents`, `-n`                                | option     | Number of sentences per document.                                                                 |
+| `--morphology`, `-m`                             | option     | Enable appending morphology to tags.                                                              |
+| `--lang`, `-l` <Tag variant="new">2.1</Tag>      | option     | Language code (if tokenizer required).                                                            |
+| `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                        |
+| **CREATES**                                      | JSON       | Data in spaCy's [JSON format](/api/annotation#json-input).                                        |

-The following file format converters are available:
+### Output file types {new="2.1"}

-| ID                | Description                                                     |
-| ----------------- | --------------------------------------------------------------- |
-| `auto`            | Automatically pick converter based on file extension (default). |
-| `conllu`, `conll` | Universal Dependencies `.conllu` or `.conll` format.            |
-| `ner`             | Tab-based named entity recognition format.                      |
-| `iob`             | IOB or IOB2 named entity recognition format.                    |
+> #### Which format should I choose?
+>
+> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
+> that there's one JSON object per line. Unlike a regular JSON file, it can also
+> be read in line-by-line and you won't have to parse the _entire file_ first.
+> This makes it a very convenient format for larger corpora.
+
+All output files generated by this command are compatible with
+[`spacy train`](/api/cli#train).
+
+| ID      | Description                       |
+| ------- | --------------------------------- |
+| `jsonl` | Newline-delimited JSON (default). |
+| `json`  | Regular JSON.                     |
+| `msg`   | Binary MessagePack format.        |
+
+### Converter options
+
+<!-- TODO: document jsonl option – maybe update it? -->
+
+| ID                             | Description                                                     |
+| ------------------------------ | --------------------------------------------------------------- |
+| `auto`                         | Automatically pick converter based on file extension (default). |
+| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format.            |
+| `ner`                          | Tab-based named entity recognition format.                      |
+| `iob`                          | IOB or IOB2 named entity recognition format.                    |

 ## Train {#train}