From 9211f30ee3704957a58cf821a1e6cfe77c2c1ce5 Mon Sep 17 00:00:00 2001 From: Mehdi Hamoumi Date: Tue, 19 Mar 2019 13:00:19 +0100 Subject: [PATCH 01/14] Tiny correction in french lookup dictionary (#3427) --- spacy/lang/fr/lemmatizer/lookup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/fr/lemmatizer/lookup.py b/spacy/lang/fr/lemmatizer/lookup.py index c032ffdc3..418ab8758 100644 --- a/spacy/lang/fr/lemmatizer/lookup.py +++ b/spacy/lang/fr/lemmatizer/lookup.py @@ -139784,7 +139784,7 @@ LOOKUP = { "nomadisant": ("nomadiser",), "nomadisent": ("nomadiser",), "nomadismes": ("nomadisme",), - "nombres": ("nombrer",), + "nombres": ("nombre",), "nombreuse": ("nombreux",), "nombreuses": ("nombreux",), "nombrilismes": ("nombrilisme",), From 6db1ddd9c7776cf07222ae58dc9b2c44135ac59a Mon Sep 17 00:00:00 2001 From: Bharat123Rox Date: Tue, 19 Mar 2019 23:02:58 +0530 Subject: [PATCH 02/14] Raise ValueError for narrow unicode build --- spacy/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/__init__.py b/spacy/__init__.py index 3498aafb7..726283e01 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals import warnings +import sys warnings.filterwarnings("ignore", message="numpy.dtype size changed") warnings.filterwarnings("ignore", message="numpy.ufunc size changed") @@ -14,6 +15,12 @@ from .about import __version__ from .errors import Warnings, deprecation_warning from . import util +if __version__ >= '2.1.0' and sys.maxunicode <= 65535: + raise ValueError('''You are running a narrow unicode build, + which is incompatible with spacy >= 2.1.0, reinstall Python and use a + wide unicode build instead. You can also rebuild Python and + set the --enable-unicode=ucs4 flag.''') + def load(name, **overrides): depr_path = overrides.get("path") From b5f077dcf454a6cde64e743781c4a2339151b754 Mon Sep 17 00:00:00 2001 From: Bharat123Rox Date: Tue, 19 Mar 2019 23:07:54 +0530 Subject: [PATCH 03/14] Sign the Contributor Agreement and update details --- .github/CONTRIBUTOR_AGREEMENT.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index da9f244eb..5e2ad915b 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [ ] I am signing on behalf of myself as an individual and no other person + * [x] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions. @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | | +| Name | Bharat Raghunathan | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | | -| GitHub username | | +| Date | 19 - 03 - 2019 | +| GitHub username | Bharat123rox | | Website (optional) | | From 583a56684317735d0946075d03b5bea42fc6d293 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Mar 2019 22:03:27 +0100 Subject: [PATCH 04/14] Add --always-link flag to cli.download (see #3435) --- spacy/cli/download.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 66a47823c..1ebdb66b2 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -19,9 +19,10 @@ msg = Printer() @plac.annotations( model=("Model to download (shortcut or name)", "positional", None, str), direct=("Force direct download of name + version", "flag", "d", bool), - pip_args=("additional arguments to be passed to `pip install` on model install"), + always_link=("Always create symlinks, even if not shortcut", "flag", "l", bool), + pip_args=("Additional arguments to be passed to `pip install` on model install"), ) -def download(model, direct=False, *pip_args): +def download(model, direct=False, always_link=False, *pip_args): """ Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name @@ -48,7 +49,7 @@ def download(model, direct=False, *pip_args): # Only create symlink if the model is installed via a shortcut like 'en'. # There's no real advantage over an additional symlink for en_core_web_sm # and if anything, it's more error prone and causes more confusion. - if model in shortcuts: + if model in shortcuts or always_link: try: # Get package path here because link uses # pip.get_installed_distributions() to check if model is a From 02d7b41893a98800c3a9514234a1d525868e611e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 20 Mar 2019 00:59:27 +0100 Subject: [PATCH 05/14] Fix GPU installation. Closes #3437 --- setup.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 34c92ad2b..6f29e1efa 100755 --- a/setup.py +++ b/setup.py @@ -238,12 +238,12 @@ def setup_package(): ], setup_requires=["wheel"], extras_require={ - "cuda": ["cupy>=4.0"], - "cuda80": ["cupy-cuda80>=4.0"], - "cuda90": ["cupy-cuda90>=4.0"], - "cuda91": ["cupy-cuda91>=4.0"], - "cuda92": ["cupy-cuda92>=4.0"], - "cuda100": ["cupy-cuda100>=4.0"], + "cuda": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy>=5.0.0b4"], + "cuda80": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda80>=5.0.0b4"], + "cuda90": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda90>=5.0.0b4"], + "cuda91": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda91>=5.0.0b4"], + "cuda92": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda92>=5.0.0b4"], + "cuda100": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda100>=5.0.0b4"], # Language tokenizers with external dependencies "ja": ["mecab-python3==0.7"], }, From 5a53e9358a2faa4f091fab196b48a66bbb6eb07f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 20 Mar 2019 00:59:45 +0100 Subject: [PATCH 06/14] Set version to 2.1.1 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 28c97da50..7b1f36c40 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.0" +__version__ = "2.1.1" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From 685fff40cfb8987a1a07bb60fffea429ff929b98 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Mar 2019 01:03:40 +0100 Subject: [PATCH 07/14] Revert "Add --always-link flag to cli.download (see #3435)" This reverts commit 583a56684317735d0946075d03b5bea42fc6d293. --- spacy/cli/download.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 1ebdb66b2..66a47823c 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -19,10 +19,9 @@ msg = Printer() @plac.annotations( model=("Model to download (shortcut or name)", "positional", None, str), direct=("Force direct download of name + version", "flag", "d", bool), - always_link=("Always create symlinks, even if not shortcut", "flag", "l", bool), - pip_args=("Additional arguments to be passed to `pip install` on model install"), + pip_args=("additional arguments to be passed to `pip install` on model install"), ) -def download(model, direct=False, always_link=False, *pip_args): +def download(model, direct=False, *pip_args): """ Download compatible model from default download path using pip. Model can be shortcut, model name or, if --direct flag is set, full model name @@ -49,7 +48,7 @@ def download(model, direct=False, always_link=False, *pip_args): # Only create symlink if the model is installed via a shortcut like 'en'. # There's no real advantage over an additional symlink for en_core_web_sm # and if anything, it's more error prone and causes more confusion. - if model in shortcuts or always_link: + if model in shortcuts: try: # Get package path here because link uses # pip.get_installed_distributions() to check if model is a From 7400c7f8a7eefe171602d6e5f148b82eef6d0693 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Mar 2019 01:19:34 +0100 Subject: [PATCH 08/14] Move UD scripts to bin --- {spacy/cli => bin}/ud/__init__.py | 0 {spacy/cli => bin}/ud/conll17_ud_eval.py | 0 {spacy/cli => bin}/ud/run_eval.py | 0 {spacy/cli => bin}/ud/ud_run_test.py | 22 +++++++++++----------- {spacy/cli => bin}/ud/ud_train.py | 18 +++++++++--------- spacy/cli/__init__.py | 1 - 6 files changed, 20 insertions(+), 21 deletions(-) rename {spacy/cli => bin}/ud/__init__.py (100%) rename {spacy/cli => bin}/ud/conll17_ud_eval.py (100%) rename {spacy/cli => bin}/ud/run_eval.py (100%) rename {spacy/cli => bin}/ud/ud_run_test.py (96%) rename {spacy/cli => bin}/ud/ud_train.py (98%) diff --git a/spacy/cli/ud/__init__.py b/bin/ud/__init__.py similarity index 100% rename from spacy/cli/ud/__init__.py rename to bin/ud/__init__.py diff --git a/spacy/cli/ud/conll17_ud_eval.py b/bin/ud/conll17_ud_eval.py similarity index 100% rename from spacy/cli/ud/conll17_ud_eval.py rename to bin/ud/conll17_ud_eval.py diff --git a/spacy/cli/ud/run_eval.py b/bin/ud/run_eval.py similarity index 100% rename from spacy/cli/ud/run_eval.py rename to bin/ud/run_eval.py diff --git a/spacy/cli/ud/ud_run_test.py b/bin/ud/ud_run_test.py similarity index 96% rename from spacy/cli/ud/ud_run_test.py rename to bin/ud/ud_run_test.py index 35c878721..b6307f799 100644 --- a/spacy/cli/ud/ud_run_test.py +++ b/bin/ud/ud_run_test.py @@ -13,14 +13,14 @@ import srsly import spacy import spacy.util -from ...tokens import Token, Doc -from ...gold import GoldParse -from ...util import compounding, minibatch_by_words -from ...syntax.nonproj import projectivize -from ...matcher import Matcher +from spacy.tokens import Token, Doc +from spacy.gold import GoldParse +from spacy.util import compounding, minibatch_by_words +from spacy.syntax.nonproj import projectivize +from spacy.matcher import Matcher -# from ...morphology import Fused_begin, Fused_inside -from ... import displacy +# from spacy.morphology import Fused_begin, Fused_inside +from spacy import displacy from collections import defaultdict, Counter from timeit import default_timer as timer @@ -33,10 +33,10 @@ import numpy.random from . import conll17_ud_eval -from ... import lang -from ...lang import zh -from ...lang import ja -from ...lang import ru +from spacy import lang +from spacy.lang import zh +from spacy.lang import ja +from spacy.lang import ru ################ diff --git a/spacy/cli/ud/ud_train.py b/bin/ud/ud_train.py similarity index 98% rename from spacy/cli/ud/ud_train.py rename to bin/ud/ud_train.py index 563fcfb87..0600ab0ff 100644 --- a/spacy/cli/ud/ud_train.py +++ b/bin/ud/ud_train.py @@ -13,12 +13,12 @@ import json import spacy import spacy.util -from ...tokens import Token, Doc -from ...gold import GoldParse -from ...util import compounding, minibatch, minibatch_by_words -from ...syntax.nonproj import projectivize -from ...matcher import Matcher -from ... import displacy +from spacy.tokens import Token, Doc +from spacy.gold import GoldParse +from spacy.util import compounding, minibatch, minibatch_by_words +from spacy.syntax.nonproj import projectivize +from spacy.matcher import Matcher +from spacy import displacy from collections import defaultdict, Counter from timeit import default_timer as timer @@ -28,9 +28,9 @@ import numpy.random from . import conll17_ud_eval -from ... import lang -from ...lang import zh -from ...lang import ja +from spacy import lang +from spacy.lang import zh +from spacy.lang import ja try: import torch diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index 4ab1c7c55..778453711 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -10,4 +10,3 @@ from .evaluate import evaluate # noqa: F401 from .convert import convert # noqa: F401 from .init_model import init_model # noqa: F401 from .validate import validate # noqa: F401 -from .ud import ud_train, ud_evaluate # noqa: F401 From f2547f02d64a83fbb30854c2a237fc5aeb8ef0e9 Mon Sep 17 00:00:00 2001 From: Bharat123Rox Date: Wed, 20 Mar 2019 07:43:19 +0530 Subject: [PATCH 09/14] Made changes suggested by @ines --- .github/CONTRIBUTOR_AGREEMENT.md | 8 +- .github/contributors/Bharat123rox.md | 106 +++++++++++++++++++++++++++ spacy/__init__.py | 9 +-- spacy/errors.py | 4 + 4 files changed, 117 insertions(+), 10 deletions(-) create mode 100644 .github/contributors/Bharat123rox.md diff --git a/.github/CONTRIBUTOR_AGREEMENT.md b/.github/CONTRIBUTOR_AGREEMENT.md index 5e2ad915b..da9f244eb 100644 --- a/.github/CONTRIBUTOR_AGREEMENT.md +++ b/.github/CONTRIBUTOR_AGREEMENT.md @@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply. 7. Please place an “x” on one of the applicable statement below. Please do NOT mark both statements: - * [x] I am signing on behalf of myself as an individual and no other person + * [ ] I am signing on behalf of myself as an individual and no other person or entity, including my employer, has or will have rights with respect to my contributions. @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Bharat Raghunathan | +| Name | | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 19 - 03 - 2019 | -| GitHub username | Bharat123rox | +| Date | | +| GitHub username | | | Website (optional) | | diff --git a/.github/contributors/Bharat123rox.md b/.github/contributors/Bharat123rox.md new file mode 100644 index 000000000..5e2ad915b --- /dev/null +++ b/.github/contributors/Bharat123rox.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Bharat Raghunathan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 19 - 03 - 2019 | +| GitHub username | Bharat123rox | +| Website (optional) | | diff --git a/spacy/__init__.py b/spacy/__init__.py index 726283e01..81604c4c2 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -12,14 +12,11 @@ from thinc.neural.util import prefer_gpu, require_gpu from .cli.info import info as cli_info from .glossary import explain from .about import __version__ -from .errors import Warnings, deprecation_warning +from .errors import Errors, Warnings, deprecation_warning from . import util -if __version__ >= '2.1.0' and sys.maxunicode <= 65535: - raise ValueError('''You are running a narrow unicode build, - which is incompatible with spacy >= 2.1.0, reinstall Python and use a - wide unicode build instead. You can also rebuild Python and - set the --enable-unicode=ucs4 flag.''') +if sys.maxunicode == 65535: + raise SystemError(Errors.E130) def load(name, **overrides): diff --git a/spacy/errors.py b/spacy/errors.py index 1fca7060b..82f17fea8 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -367,6 +367,10 @@ class Errors(object): "Instead, create a new Span object and specify the `label` keyword argument, " "for example:\nfrom spacy.tokens import Span\n" "span = Span(doc, start={start}, end={end}, label='{label}')") + E130 = ("You are running a narrow unicode build, " + "which is incompatible with spacy >= 2.1.0, reinstall Python and " + "use a wide unicode build instead. You can also rebuild Python " + "and set the --enable-unicode=ucs4 flag.") @add_codes From 6abc1ddb266e6a0627e4d7ac41d709132be709c8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Mar 2019 09:43:26 +0100 Subject: [PATCH 10/14] Update __main__.py --- spacy/__main__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index a1679d7fd..716561566 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -9,8 +9,7 @@ if __name__ == "__main__": import sys from wasabi import Printer from spacy.cli import download, link, info, package, train, pretrain, convert - from spacy.cli import init_model, profile, evaluate, validate - from spacy.cli import ud_train, ud_evaluate, debug_data + from spacy.cli import init_model, profile, evaluate, validate, debug_data msg = Printer() @@ -21,9 +20,7 @@ if __name__ == "__main__": "train": train, "pretrain": pretrain, "debug-data": debug_data, - "ud-train": ud_train, "evaluate": evaluate, - "ud-evaluate": ud_evaluate, "convert": convert, "package": package, "init-model": init_model, From ae5b4d0e84b80494538d0577bfec91f9ffedb993 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Mar 2019 09:55:45 +0100 Subject: [PATCH 11/14] Fix formatting (hopefully also restarts build properly) --- spacy/__init__.py | 1 + spacy/errors.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 81604c4c2..9edbab198 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -15,6 +15,7 @@ from .about import __version__ from .errors import Errors, Warnings, deprecation_warning from . import util + if sys.maxunicode == 65535: raise SystemError(Errors.E130) diff --git a/spacy/errors.py b/spacy/errors.py index 82f17fea8..b63c46919 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -367,10 +367,10 @@ class Errors(object): "Instead, create a new Span object and specify the `label` keyword argument, " "for example:\nfrom spacy.tokens import Span\n" "span = Span(doc, start={start}, end={end}, label='{label}')") - E130 = ("You are running a narrow unicode build, " - "which is incompatible with spacy >= 2.1.0, reinstall Python and " - "use a wide unicode build instead. You can also rebuild Python " - "and set the --enable-unicode=ucs4 flag.") + E130 = ("You are running a narrow unicode build, which is incompatible " + "with spacy >= 2.1.0. To fix this, reinstall Python and use a wide " + "unicode build instead. You can also rebuild Python and set the " + "--enable-unicode=ucs4 flag.") @add_codes From 1612990e888385df9dd4bab28071b0ff4ed69745 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 20 Mar 2019 11:06:35 +0000 Subject: [PATCH 12/14] Implement cosine loss for spacy pretrain. Make default --- spacy/cli/pretrain.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 0ea895597..e44af8b48 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -9,7 +9,7 @@ from collections import Counter from pathlib import Path from thinc.v2v import Affine, Maxout from thinc.misc import LayerNorm as LN -from thinc.neural.util import prefer_gpu +from thinc.neural.util import prefer_gpu, get_array_module from wasabi import Printer import srsly @@ -27,6 +27,7 @@ from .. import util width=("Width of CNN layers", "option", "cw", int), depth=("Depth of CNN layers", "option", "cd", int), embed_rows=("Embedding rows", "option", "er", int), + loss_func=("Loss to use for the objective. L2 or cosine", "option", "L", str), use_vectors=("Whether to use the static vectors as input features", "flag", "uv"), dropout=("Dropout", "option", "d", float), batch_size=("Number of words per training batch", "option", "bs", int), @@ -42,6 +43,7 @@ def pretrain( width=96, depth=4, embed_rows=2000, + loss_func="cosine", use_vectors=False, dropout=0.2, nr_iter=1000, @@ -123,7 +125,7 @@ def pretrain( max_length=max_length, min_length=min_length, ) - loss = make_update(model, docs, optimizer, drop=dropout) + loss = make_update(model, docs, optimizer, objective=loss_func, drop=dropout) progress = tracker.update(epoch, loss, docs) if progress: msg.row(progress, **row_settings) @@ -196,11 +198,26 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) target = docs[0].vocab.vectors.data[ids] if objective == "L2": - d_scores = prediction - target - loss = (d_scores ** 2).sum() - else: - raise NotImplementedError(objective) - return loss, d_scores + d_target = prediction - target + loss = (d_target ** 2).sum() + elif objective == "cosine": + loss, d_target = get_cossim_loss(prediction, target) + return loss, d_target + + +def get_cossim_loss(yh, y): + # Add a small constant to avoid 0 vectors + yh = yh + 1e-8 + y = y + 1e-8 + # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity + xp = get_array_module(yh) + norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) + norm_y = xp.linalg.norm(y, axis=1, keepdims=True) + mul_norms = norm_yh * norm_y + cosine = (yh * y).sum(axis=1, keepdims=True) / mul_norms + d_yh = (y / mul_norms) - (cosine * (yh / norm_yh**2)) + loss = xp.abs(cosine-1).sum() + return loss, -d_yh def create_pretraining_model(nlp, tok2vec): From 72889a16d558848191e51f4bfb200e70d3bc413a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 20 Mar 2019 12:09:59 +0100 Subject: [PATCH 13/14] Fix similarity calculation if vectors are on GPU (#3440) --- spacy/tokens/doc.pyx | 7 ++++--- spacy/tokens/span.pyx | 10 ++++------ spacy/tokens/token.pyx | 4 +++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index dd610bd6d..d4d7e5fa4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -416,8 +416,9 @@ cdef class Doc: return self.user_hooks["vector"](self) if self._vector is not None: return self._vector - elif not len(self): - self._vector = numpy.zeros((self.vocab.vectors_length,), dtype="f") + xp = get_array_module(self.vocab.vectors.data) + if not len(self): + self._vector = xp.zeros((self.vocab.vectors_length,), dtype="f") return self._vector elif self.vocab.vectors.data.size > 0: self._vector = sum(t.vector for t in self) / len(self) @@ -426,7 +427,7 @@ cdef class Doc: self._vector = self.tensor.mean(axis=0) return self._vector else: - return numpy.zeros((self.vocab.vectors_length,), dtype="float32") + return xp.zeros((self.vocab.vectors_length,), dtype="float32") def __set__(self, value): self._vector = value diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 36eaeb568..e62caed40 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -420,13 +420,11 @@ cdef class Span: """ if "vector_norm" in self.doc.user_span_hooks: return self.doc.user_span_hooks["vector"](self) - cdef float value - cdef double norm = 0 + vector = self.vector + xp = get_array_module(vector) if self._vector_norm is None: - norm = 0 - for value in self.vector: - norm += value * value - self._vector_norm = sqrt(norm) if norm != 0 else 0 + total = (vector*vector).sum() + self._vector_norm = xp.sqrt(total) if total != 0. else 0. return self._vector_norm @property diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 409b68290..66728d35c 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -404,7 +404,9 @@ cdef class Token: if "vector_norm" in self.doc.user_token_hooks: return self.doc.user_token_hooks["vector_norm"](self) vector = self.vector - return numpy.sqrt((vector ** 2).sum()) + xp = get_array_module(vector) + total = (vector ** 2).sum() + return xp.sqrt(total) if total != 0. else 0. @property def n_lefts(self): From dac8f8ff99914d25843933d6440ea6f36e610884 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Mar 2019 17:24:17 +0100 Subject: [PATCH 14/14] Update Span.__init__ docs (see #3445) [ci skip] --- website/docs/api/span.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 43924a2b5..6a22d6a8a 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -18,14 +18,14 @@ Create a Span object from the `slice doc[start : end]`. > assert [t.text for t in span] == [u"it", u"back", u"!"] > ``` -| Name | Type | Description | -| ----------- | ---------------------------------------- | ------------------------------------------------------- | -| `doc` | `Doc` | The parent document. | -| `start` | int | The index of the first token of the span. | -| `end` | int | The index of the first token after the span. | -| `label` | int | A label to attach to the span, e.g. for named entities. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | -| **RETURNS** | `Span` | The newly constructed object. | +| Name | Type | Description | +| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The parent document. | +| `start` | int | The index of the first token of the span. | +| `end` | int | The index of the first token after the span. | +| `label` | int / unicode | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a unicode string. | +| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| **RETURNS** | `Span` | The newly constructed object. | ## Span.\_\_getitem\_\_ {#getitem tag="method"}