mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-15 12:06:25 +03:00
Merge remote-tracking branch 'upstream/master' into bugfix/tokenizer-special-cases-matcher
This commit is contained in:
commit
cfc318b76c
4
.flake8
4
.flake8
|
@ -6,9 +6,5 @@ exclude =
|
||||||
.env,
|
.env,
|
||||||
.git,
|
.git,
|
||||||
__pycache__,
|
__pycache__,
|
||||||
lemmatizer.py,
|
|
||||||
lookup.py,
|
|
||||||
_tokenizer_exceptions_list.py,
|
_tokenizer_exceptions_list.py,
|
||||||
spacy/lang/fr/lemmatizer,
|
|
||||||
spacy/lang/nb/lemmatizer
|
|
||||||
spacy/__init__.py
|
spacy/__init__.py
|
||||||
|
|
106
.github/contributors/mihaigliga21.md
vendored
Normal file
106
.github/contributors/mihaigliga21.md
vendored
Normal file
|
@ -0,0 +1,106 @@
|
||||||
|
# spaCy contributor agreement
|
||||||
|
|
||||||
|
This spaCy Contributor Agreement (**"SCA"**) is based on the
|
||||||
|
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
|
||||||
|
The SCA applies to any contribution that you make to any product or project
|
||||||
|
managed by us (the **"project"**), and sets out the intellectual property rights
|
||||||
|
you grant to us in the contributed materials. The term **"us"** shall mean
|
||||||
|
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
|
||||||
|
**"you"** shall mean the person or entity identified below.
|
||||||
|
|
||||||
|
If you agree to be bound by these terms, fill in the information requested
|
||||||
|
below and include the filled-in version with your first pull request, under the
|
||||||
|
folder [`.github/contributors/`](/.github/contributors/). The name of the file
|
||||||
|
should be your GitHub username, with the extension `.md`. For example, the user
|
||||||
|
example_user would create the file `.github/contributors/example_user.md`.
|
||||||
|
|
||||||
|
Read this agreement carefully before signing. These terms and conditions
|
||||||
|
constitute a binding legal agreement.
|
||||||
|
|
||||||
|
## Contributor Agreement
|
||||||
|
|
||||||
|
1. The term "contribution" or "contributed materials" means any source code,
|
||||||
|
object code, patch, tool, sample, graphic, specification, manual,
|
||||||
|
documentation, or any other material posted or submitted by you to the project.
|
||||||
|
|
||||||
|
2. With respect to any worldwide copyrights, or copyright applications and
|
||||||
|
registrations, in your contribution:
|
||||||
|
|
||||||
|
* you hereby assign to us joint ownership, and to the extent that such
|
||||||
|
assignment is or becomes invalid, ineffective or unenforceable, you hereby
|
||||||
|
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
|
||||||
|
royalty-free, unrestricted license to exercise all rights under those
|
||||||
|
copyrights. This includes, at our option, the right to sublicense these same
|
||||||
|
rights to third parties through multiple levels of sublicensees or other
|
||||||
|
licensing arrangements;
|
||||||
|
|
||||||
|
* you agree that each of us can do all things in relation to your
|
||||||
|
contribution as if each of us were the sole owners, and if one of us makes
|
||||||
|
a derivative work of your contribution, the one who makes the derivative
|
||||||
|
work (or has it made) will be the sole owner of that derivative work;
|
||||||
|
|
||||||
|
* you agree that you will not assert any moral rights in your contribution
|
||||||
|
against us, our licensees or transferees;
|
||||||
|
|
||||||
|
* you agree that we may register a copyright in your contribution and
|
||||||
|
exercise all ownership rights associated with it; and
|
||||||
|
|
||||||
|
* you agree that neither of us has any duty to consult with, obtain the
|
||||||
|
consent of, pay or render an accounting to the other for any use or
|
||||||
|
distribution of your contribution.
|
||||||
|
|
||||||
|
3. With respect to any patents you own, or that you can license without payment
|
||||||
|
to any third party, you hereby grant to us a perpetual, irrevocable,
|
||||||
|
non-exclusive, worldwide, no-charge, royalty-free license to:
|
||||||
|
|
||||||
|
* make, have made, use, sell, offer to sell, import, and otherwise transfer
|
||||||
|
your contribution in whole or in part, alone or in combination with or
|
||||||
|
included in any product, work or materials arising out of the project to
|
||||||
|
which your contribution was submitted, and
|
||||||
|
|
||||||
|
* at our option, to sublicense these same rights to third parties through
|
||||||
|
multiple levels of sublicensees or other licensing arrangements.
|
||||||
|
|
||||||
|
4. Except as set out above, you keep all right, title, and interest in your
|
||||||
|
contribution. The rights that you grant to us under these terms are effective
|
||||||
|
on the date you first submitted a contribution to us, even if your submission
|
||||||
|
took place before the date you sign these terms.
|
||||||
|
|
||||||
|
5. You covenant, represent, warrant and agree that:
|
||||||
|
|
||||||
|
* Each contribution that you submit is and shall be an original work of
|
||||||
|
authorship and you can legally grant the rights set out in this SCA;
|
||||||
|
|
||||||
|
* to the best of your knowledge, each contribution will not violate any
|
||||||
|
third party's copyrights, trademarks, patents, or other intellectual
|
||||||
|
property rights; and
|
||||||
|
|
||||||
|
* each contribution shall be in compliance with U.S. export control laws and
|
||||||
|
other applicable export and import laws. You agree to notify us if you
|
||||||
|
become aware of any circumstance which would make any of the foregoing
|
||||||
|
representations inaccurate in any respect. We may publicly disclose your
|
||||||
|
participation in the project, including the fact that you have signed the SCA.
|
||||||
|
|
||||||
|
6. This SCA is governed by the laws of the State of California and applicable
|
||||||
|
U.S. Federal law. Any choice of law rules will not apply.
|
||||||
|
|
||||||
|
7. Please place an “x” on one of the applicable statements below. Please do NOT
|
||||||
|
mark both statements:
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of myself as an individual and no other person
|
||||||
|
or entity, including my employer, has or will have rights with respect to my
|
||||||
|
contributions.
|
||||||
|
|
||||||
|
* [x] I am signing on behalf of my employer or a legal entity and I have the
|
||||||
|
actual authority to contractually bind that entity.
|
||||||
|
|
||||||
|
## Contributor Details
|
||||||
|
|
||||||
|
| Field | Entry |
|
||||||
|
|------------------------------- | -------------------------- |
|
||||||
|
| Name | Mihai Gliga |
|
||||||
|
| Company name (if applicable) | |
|
||||||
|
| Title or role (if applicable) | |
|
||||||
|
| Date | September 9, 2019 |
|
||||||
|
| GitHub username | mihaigliga21 |
|
||||||
|
| Website (optional) | |
|
|
@ -5,7 +5,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
|
|
@ -5,7 +5,6 @@
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
@ -462,6 +461,9 @@ def main(
|
||||||
vectors_dir=None,
|
vectors_dir=None,
|
||||||
use_oracle_segments=False,
|
use_oracle_segments=False,
|
||||||
):
|
):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
spacy.util.fix_random_seed()
|
spacy.util.fix_random_seed()
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
|
|
|
@ -3,11 +3,9 @@
|
||||||
"""
|
"""
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
import plac
|
import plac
|
||||||
import tqdm
|
|
||||||
import attr
|
import attr
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import re
|
import re
|
||||||
import sys
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
|
@ -23,7 +21,7 @@ import itertools
|
||||||
import random
|
import random
|
||||||
import numpy.random
|
import numpy.random
|
||||||
|
|
||||||
import conll17_ud_eval
|
from bin.ud import conll17_ud_eval
|
||||||
|
|
||||||
import spacy.lang.zh
|
import spacy.lang.zh
|
||||||
import spacy.lang.ja
|
import spacy.lang.ja
|
||||||
|
@ -394,6 +392,9 @@ class TreebankPaths(object):
|
||||||
limit=("Size limit", "option", "n", int),
|
limit=("Size limit", "option", "n", int),
|
||||||
)
|
)
|
||||||
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
def main(ud_dir, parses_dir, config, corpus, limit=0):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
paths = TreebankPaths(ud_dir, corpus)
|
paths = TreebankPaths(ud_dir, corpus)
|
||||||
if not (parses_dir / corpus).exists():
|
if not (parses_dir / corpus).exists():
|
||||||
(parses_dir / corpus).mkdir()
|
(parses_dir / corpus).mkdir()
|
||||||
|
|
|
@ -18,7 +18,6 @@ import random
|
||||||
import spacy
|
import spacy
|
||||||
import thinc.extra.datasets
|
import thinc.extra.datasets
|
||||||
from spacy.util import minibatch, use_gpu, compounding
|
from spacy.util import minibatch, use_gpu, compounding
|
||||||
import tqdm
|
|
||||||
from spacy._ml import Tok2Vec
|
from spacy._ml import Tok2Vec
|
||||||
from spacy.pipeline import TextCategorizer
|
from spacy.pipeline import TextCategorizer
|
||||||
import numpy
|
import numpy
|
||||||
|
@ -107,6 +106,9 @@ def create_pipeline(width, embed_size, vectors_model):
|
||||||
|
|
||||||
|
|
||||||
def train_tensorizer(nlp, texts, dropout, n_iter):
|
def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
tensorizer = nlp.create_pipe("tensorizer")
|
tensorizer = nlp.create_pipe("tensorizer")
|
||||||
nlp.add_pipe(tensorizer)
|
nlp.add_pipe(tensorizer)
|
||||||
optimizer = nlp.begin_training()
|
optimizer = nlp.begin_training()
|
||||||
|
@ -120,6 +122,9 @@ def train_tensorizer(nlp, texts, dropout, n_iter):
|
||||||
|
|
||||||
|
|
||||||
def train_textcat(nlp, n_texts, n_iter=10):
|
def train_textcat(nlp, n_texts, n_iter=10):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
textcat = nlp.get_pipe("textcat")
|
textcat = nlp.get_pipe("textcat")
|
||||||
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
tok2vec_weights = textcat.model.tok2vec.to_bytes()
|
||||||
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
(train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
|
||||||
|
|
|
@ -13,7 +13,6 @@ import numpy
|
||||||
import plac
|
import plac
|
||||||
import spacy
|
import spacy
|
||||||
import tensorflow as tf
|
import tensorflow as tf
|
||||||
import tqdm
|
|
||||||
from tensorflow.contrib.tensorboard.plugins.projector import (
|
from tensorflow.contrib.tensorboard.plugins.projector import (
|
||||||
visualize_embeddings,
|
visualize_embeddings,
|
||||||
ProjectorConfig,
|
ProjectorConfig,
|
||||||
|
@ -36,6 +35,9 @@ from tensorflow.contrib.tensorboard.plugins.projector import (
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
def main(vectors_loc, out_loc, name="spaCy_vectors"):
|
def main(vectors_loc, out_loc, name="spaCy_vectors"):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
meta_file = "{}.tsv".format(name)
|
meta_file = "{}.tsv".format(name)
|
||||||
out_meta_file = path.join(out_loc, meta_file)
|
out_meta_file = path.join(out_loc, meta_file)
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import plac
|
import plac
|
||||||
import math
|
import math
|
||||||
from tqdm import tqdm
|
|
||||||
import numpy
|
import numpy
|
||||||
from ast import literal_eval
|
from ast import literal_eval
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -109,6 +108,9 @@ def open_file(loc):
|
||||||
|
|
||||||
|
|
||||||
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
def read_attrs_from_deprecated(freqs_loc, clusters_loc):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
if freqs_loc is not None:
|
if freqs_loc is not None:
|
||||||
with msg.loading("Counting frequencies..."):
|
with msg.loading("Counting frequencies..."):
|
||||||
probs, _ = read_freqs(freqs_loc)
|
probs, _ = read_freqs(freqs_loc)
|
||||||
|
@ -186,6 +188,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors):
|
||||||
|
|
||||||
|
|
||||||
def read_vectors(vectors_loc):
|
def read_vectors(vectors_loc):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
f = open_file(vectors_loc)
|
f = open_file(vectors_loc)
|
||||||
shape = tuple(int(size) for size in next(f).split())
|
shape = tuple(int(size) for size in next(f).split())
|
||||||
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
vectors_data = numpy.zeros(shape=shape, dtype="f")
|
||||||
|
@ -202,6 +207,9 @@ def read_vectors(vectors_loc):
|
||||||
|
|
||||||
|
|
||||||
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
counts = PreshCounter()
|
counts = PreshCounter()
|
||||||
total = 0
|
total = 0
|
||||||
with freqs_loc.open() as f:
|
with freqs_loc.open() as f:
|
||||||
|
@ -231,6 +239,9 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
|
||||||
|
|
||||||
|
|
||||||
def read_clusters(clusters_loc):
|
def read_clusters(clusters_loc):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
clusters = {}
|
clusters = {}
|
||||||
if ftfy is None:
|
if ftfy is None:
|
||||||
user_warning(Warnings.W004)
|
user_warning(Warnings.W004)
|
||||||
|
|
|
@ -7,7 +7,6 @@ import srsly
|
||||||
import cProfile
|
import cProfile
|
||||||
import pstats
|
import pstats
|
||||||
import sys
|
import sys
|
||||||
import tqdm
|
|
||||||
import itertools
|
import itertools
|
||||||
import thinc.extra.datasets
|
import thinc.extra.datasets
|
||||||
from wasabi import Printer
|
from wasabi import Printer
|
||||||
|
@ -48,6 +47,9 @@ def profile(model, inputs=None, n_texts=10000):
|
||||||
|
|
||||||
|
|
||||||
def parse_texts(nlp, texts):
|
def parse_texts(nlp, texts):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,6 @@ from __future__ import unicode_literals, division, print_function
|
||||||
import plac
|
import plac
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import tqdm
|
|
||||||
from thinc.neural._classes.model import Model
|
from thinc.neural._classes.model import Model
|
||||||
from timeit import default_timer as timer
|
from timeit import default_timer as timer
|
||||||
import shutil
|
import shutil
|
||||||
|
@ -101,6 +100,10 @@ def train(
|
||||||
JSON format. To convert data from other formats, use the `spacy convert`
|
JSON format. To convert data from other formats, use the `spacy convert`
|
||||||
command.
|
command.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
msg = Printer()
|
msg = Printer()
|
||||||
util.fix_random_seed()
|
util.fix_random_seed()
|
||||||
util.set_env_log(verbose)
|
util.set_env_log(verbose)
|
||||||
|
@ -390,6 +393,9 @@ def _score_for_model(meta):
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
def _create_progress_bar(total):
|
def _create_progress_bar(total):
|
||||||
|
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
|
||||||
|
import tqdm
|
||||||
|
|
||||||
if int(os.environ.get("LOG_FRIENDLY", 0)):
|
if int(os.environ.get("LOG_FRIENDLY", 0)):
|
||||||
yield
|
yield
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -452,7 +452,10 @@ class Errors(object):
|
||||||
"Make sure that you're passing in absolute token indices, not "
|
"Make sure that you're passing in absolute token indices, not "
|
||||||
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
"relative token offsets.\nstart: {start}, end: {end}, label: "
|
||||||
"{label}, direction: {dir}")
|
"{label}, direction: {dir}")
|
||||||
E158 = ("Tokenizer special cases are not allowed to modify the text. "
|
E158 = ("Can't add table '{name}' to lookups because it already exists.")
|
||||||
|
E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
|
||||||
|
E160 = ("Can't find language data file: {path}")
|
||||||
|
E161 = ("Tokenizer special cases are not allowed to modify the text. "
|
||||||
"This would map '{chunk}' to '{orth}' given token attributes "
|
"This would map '{chunk}' to '{orth}' given token attributes "
|
||||||
"'{token_attrs}'.")
|
"'{token_attrs}'.")
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
|
||||||
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
|
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
|
||||||
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
|
from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT
|
||||||
|
|
||||||
|
|
||||||
_prefixes = (
|
_prefixes = (
|
||||||
|
@ -27,8 +27,8 @@ _suffixes = (
|
||||||
r"(?<=°[FfCcKk])\.",
|
r"(?<=°[FfCcKk])\.",
|
||||||
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
|
||||||
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
r"(?<=[0-9])(?:{u})".format(u=UNITS),
|
||||||
r"(?<=[0-9{al}{e}(?:{q})])\.".format(
|
r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
|
||||||
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
|
al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
|
||||||
),
|
),
|
||||||
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
|
||||||
]
|
]
|
||||||
|
|
|
@ -9,6 +9,7 @@ from ..norm_exceptions import BASE_NORMS
|
||||||
from ...language import Language
|
from ...language import Language
|
||||||
from ...attrs import LANG, NORM
|
from ...attrs import LANG, NORM
|
||||||
from ...util import update_exc, add_lookups
|
from ...util import update_exc, add_lookups
|
||||||
|
from .tag_map import TAG_MAP
|
||||||
|
|
||||||
# Lemma data note:
|
# Lemma data note:
|
||||||
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
|
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/
|
||||||
|
@ -24,6 +25,7 @@ class RomanianDefaults(Language.Defaults):
|
||||||
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
|
||||||
stop_words = STOP_WORDS
|
stop_words = STOP_WORDS
|
||||||
resources = {"lemma_lookup": "lemma_lookup.json"}
|
resources = {"lemma_lookup": "lemma_lookup.json"}
|
||||||
|
tag_map = TAG_MAP
|
||||||
|
|
||||||
|
|
||||||
class Romanian(Language):
|
class Romanian(Language):
|
||||||
|
|
2085
spacy/lang/ro/tag_map.py
Normal file
2085
spacy/lang/ro/tag_map.py
Normal file
File diff suppressed because it is too large
Load Diff
127
spacy/lookups.py
127
spacy/lookups.py
|
@ -1,52 +1,157 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from .util import SimpleFrozenDict
|
import srsly
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
from .errors import Errors
|
||||||
|
from .util import SimpleFrozenDict, ensure_path
|
||||||
|
|
||||||
|
|
||||||
class Lookups(object):
|
class Lookups(object):
|
||||||
|
"""Container for large lookup tables and dictionaries, e.g. lemmatization
|
||||||
|
data or tokenizer exception lists. Lookups are available via vocab.lookups,
|
||||||
|
so they can be accessed before the pipeline components are applied (e.g.
|
||||||
|
in the tokenizer and lemmatizer), as well as within the pipeline components
|
||||||
|
via doc.vocab.lookups.
|
||||||
|
|
||||||
|
Important note: At the moment, this class only performs a very basic
|
||||||
|
dictionary lookup. We're planning to replace this with a more efficient
|
||||||
|
implementation. See #3971 for details.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._tables = {}
|
"""Initialize the Lookups object.
|
||||||
|
|
||||||
|
RETURNS (Lookups): The newly created object.
|
||||||
|
"""
|
||||||
|
self._tables = OrderedDict()
|
||||||
|
|
||||||
def __contains__(self, name):
|
def __contains__(self, name):
|
||||||
|
"""Check if the lookups contain a table of a given name. Delegates to
|
||||||
|
Lookups.has_table.
|
||||||
|
|
||||||
|
name (unicode): Name of the table.
|
||||||
|
RETURNS (bool): Whether a table of that name exists.
|
||||||
|
"""
|
||||||
return self.has_table(name)
|
return self.has_table(name)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
"""RETURNS (int): The number of tables in the lookups."""
|
||||||
|
return len(self._tables)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def tables(self):
|
def tables(self):
|
||||||
|
"""RETURNS (list): Names of all tables in the lookups."""
|
||||||
return list(self._tables.keys())
|
return list(self._tables.keys())
|
||||||
|
|
||||||
def add_table(self, name, data=SimpleFrozenDict()):
|
def add_table(self, name, data=SimpleFrozenDict()):
|
||||||
|
"""Add a new table to the lookups. Raises an error if the table exists.
|
||||||
|
|
||||||
|
name (unicode): Unique name of table.
|
||||||
|
data (dict): Optional data to add to the table.
|
||||||
|
RETURNS (Table): The newly added table.
|
||||||
|
"""
|
||||||
if name in self.tables:
|
if name in self.tables:
|
||||||
raise ValueError("Table '{}' already exists".format(name))
|
raise ValueError(Errors.E158.format(name=name))
|
||||||
table = Table(name=name)
|
table = Table(name=name)
|
||||||
table.update(data)
|
table.update(data)
|
||||||
self._tables[name] = table
|
self._tables[name] = table
|
||||||
return table
|
return table
|
||||||
|
|
||||||
def get_table(self, name):
|
def get_table(self, name):
|
||||||
|
"""Get a table. Raises an error if the table doesn't exist.
|
||||||
|
|
||||||
|
name (unicode): Name of the table.
|
||||||
|
RETURNS (Table): The table.
|
||||||
|
"""
|
||||||
if name not in self._tables:
|
if name not in self._tables:
|
||||||
raise KeyError("Can't find table '{}'".format(name))
|
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||||
return self._tables[name]
|
return self._tables[name]
|
||||||
|
|
||||||
|
def remove_table(self, name):
|
||||||
|
"""Remove a table. Raises an error if the table doesn't exist.
|
||||||
|
|
||||||
|
name (unicode): The name to remove.
|
||||||
|
RETURNS (Table): The removed table.
|
||||||
|
"""
|
||||||
|
if name not in self._tables:
|
||||||
|
raise KeyError(Errors.E159.format(name=name, tables=self.tables))
|
||||||
|
return self._tables.pop(name)
|
||||||
|
|
||||||
def has_table(self, name):
|
def has_table(self, name):
|
||||||
|
"""Check if the lookups contain a table of a given name.
|
||||||
|
|
||||||
|
name (unicode): Name of the table.
|
||||||
|
RETURNS (bool): Whether a table of that name exists.
|
||||||
|
"""
|
||||||
return name in self._tables
|
return name in self._tables
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
raise NotImplementedError
|
"""Serialize the lookups to a bytestring.
|
||||||
|
|
||||||
|
exclude (list): String names of serialization fields to exclude.
|
||||||
|
RETURNS (bytes): The serialized Lookups.
|
||||||
|
"""
|
||||||
|
return srsly.msgpack_dumps(self._tables)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
|
||||||
raise NotImplementedError
|
"""Load the lookups from a bytestring.
|
||||||
|
|
||||||
def to_disk(self, path, exclude=tuple(), **kwargs):
|
exclude (list): String names of serialization fields to exclude.
|
||||||
raise NotImplementedError
|
RETURNS (Lookups): The loaded Lookups.
|
||||||
|
"""
|
||||||
|
self._tables = OrderedDict()
|
||||||
|
msg = srsly.msgpack_loads(bytes_data)
|
||||||
|
for key, value in msg.items():
|
||||||
|
self._tables[key] = Table.from_dict(value)
|
||||||
|
return self
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def to_disk(self, path, **kwargs):
|
||||||
raise NotImplementedError
|
"""Save the lookups to a directory as lookups.bin.
|
||||||
|
|
||||||
|
path (unicode / Path): The directory path.
|
||||||
|
"""
|
||||||
|
if len(self._tables):
|
||||||
|
path = ensure_path(path)
|
||||||
|
filepath = path / "lookups.bin"
|
||||||
|
with filepath.open("wb") as file_:
|
||||||
|
file_.write(self.to_bytes())
|
||||||
|
|
||||||
|
def from_disk(self, path, **kwargs):
|
||||||
|
"""Load lookups from a directory containing a lookups.bin.
|
||||||
|
|
||||||
|
path (unicode / Path): The directory path.
|
||||||
|
RETURNS (Lookups): The loaded lookups.
|
||||||
|
"""
|
||||||
|
path = ensure_path(path)
|
||||||
|
filepath = path / "lookups.bin"
|
||||||
|
if filepath.exists():
|
||||||
|
with filepath.open("rb") as file_:
|
||||||
|
data = file_.read()
|
||||||
|
return self.from_bytes(data)
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
||||||
class Table(dict):
|
class Table(OrderedDict):
|
||||||
|
"""A table in the lookups. Subclass of builtin dict that implements a
|
||||||
|
slightly more consistent and unified API.
|
||||||
|
"""
|
||||||
|
@classmethod
|
||||||
|
def from_dict(cls, data, name=None):
|
||||||
|
self = cls(name=name)
|
||||||
|
self.update(data)
|
||||||
|
return self
|
||||||
|
|
||||||
def __init__(self, name=None):
|
def __init__(self, name=None):
|
||||||
|
"""Initialize a new table.
|
||||||
|
|
||||||
|
name (unicode): Optional table name for reference.
|
||||||
|
RETURNS (Table): The newly created object.
|
||||||
|
"""
|
||||||
|
OrderedDict.__init__(self)
|
||||||
self.name = name
|
self.name = name
|
||||||
|
|
||||||
def set(self, key, value):
|
def set(self, key, value):
|
||||||
|
"""Set new key/value pair. Same as table[key] = value."""
|
||||||
self[key] = value
|
self[key] = value
|
||||||
|
|
|
@ -133,3 +133,9 @@ def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
|
||||||
assert tokens[6].text == "Puddleton"
|
assert tokens[6].text == "Puddleton"
|
||||||
assert tokens[7].text == "?"
|
assert tokens[7].text == "?"
|
||||||
assert tokens[8].text == "\u2014"
|
assert tokens[8].text == "\u2014"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("text,length", [("_MATH_", 3), ("_MATH_.", 4)])
|
||||||
|
def test_final_period(en_tokenizer, text, length):
|
||||||
|
tokens = en_tokenizer(text)
|
||||||
|
assert len(tokens) == length
|
||||||
|
|
|
@ -94,6 +94,9 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
|
||||||
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
# I can't get this to work with the lookup tables for 3.5 :(. Something to do
|
||||||
|
# with the dict ordering
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
|
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
|
||||||
tensorizer = Tensorizer(en_vocab)
|
tensorizer = Tensorizer(en_vocab)
|
||||||
tensorizer.model = tensorizer.Model()
|
tensorizer.model = tensorizer.Model()
|
||||||
|
@ -112,6 +115,9 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
|
||||||
assert tensorizer.to_bytes() == tensorizer_d.to_bytes()
|
assert tensorizer.to_bytes() == tensorizer_d.to_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
# I can't get this to work with the lookup tables for 3.5 :(. Something to do
|
||||||
|
# with the dict ordering
|
||||||
|
@pytest.mark.xfail
|
||||||
def test_serialize_textcat_empty(en_vocab):
|
def test_serialize_textcat_empty(en_vocab):
|
||||||
# See issue #1105
|
# See issue #1105
|
||||||
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
|
textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
|
||||||
|
|
|
@ -12,12 +12,14 @@ test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
|
||||||
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
|
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail
|
||||||
@pytest.mark.parametrize("text", ["rat"])
|
@pytest.mark.parametrize("text", ["rat"])
|
||||||
def test_serialize_vocab(en_vocab, text):
|
def test_serialize_vocab(en_vocab, text):
|
||||||
text_hash = en_vocab.strings.add(text)
|
text_hash = en_vocab.strings.add(text)
|
||||||
vocab_bytes = en_vocab.to_bytes()
|
vocab_bytes = en_vocab.to_bytes(exclude=["lookups"])
|
||||||
new_vocab = Vocab().from_bytes(vocab_bytes)
|
new_vocab = Vocab().from_bytes(vocab_bytes)
|
||||||
assert new_vocab.strings[text_hash] == text
|
assert new_vocab.strings[text_hash] == text
|
||||||
|
assert new_vocab.to_bytes(exclude=["lookups"]) == vocab_bytes
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
@pytest.mark.parametrize("strings1,strings2", test_strings)
|
||||||
|
|
|
@ -3,6 +3,9 @@ from __future__ import unicode_literals
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from spacy.lookups import Lookups
|
from spacy.lookups import Lookups
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
|
||||||
|
from ..util import make_tempdir
|
||||||
|
|
||||||
|
|
||||||
def test_lookups_api():
|
def test_lookups_api():
|
||||||
|
@ -10,6 +13,7 @@ def test_lookups_api():
|
||||||
data = {"foo": "bar", "hello": "world"}
|
data = {"foo": "bar", "hello": "world"}
|
||||||
lookups = Lookups()
|
lookups = Lookups()
|
||||||
lookups.add_table(table_name, data)
|
lookups.add_table(table_name, data)
|
||||||
|
assert len(lookups) == 1
|
||||||
assert table_name in lookups
|
assert table_name in lookups
|
||||||
assert lookups.has_table(table_name)
|
assert lookups.has_table(table_name)
|
||||||
table = lookups.get_table(table_name)
|
table = lookups.get_table(table_name)
|
||||||
|
@ -22,5 +26,89 @@ def test_lookups_api():
|
||||||
assert len(table) == 3
|
assert len(table) == 3
|
||||||
with pytest.raises(KeyError):
|
with pytest.raises(KeyError):
|
||||||
lookups.get_table("xyz")
|
lookups.get_table("xyz")
|
||||||
# with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
# lookups.add_table(table_name)
|
lookups.add_table(table_name)
|
||||||
|
table = lookups.remove_table(table_name)
|
||||||
|
assert table.name == table_name
|
||||||
|
assert len(lookups) == 0
|
||||||
|
assert table_name not in lookups
|
||||||
|
with pytest.raises(KeyError):
|
||||||
|
lookups.get_table(table_name)
|
||||||
|
|
||||||
|
|
||||||
|
# This fails on Python 3.5
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_lookups_to_from_bytes():
|
||||||
|
lookups = Lookups()
|
||||||
|
lookups.add_table("table1", {"foo": "bar", "hello": "world"})
|
||||||
|
lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
|
||||||
|
lookups_bytes = lookups.to_bytes()
|
||||||
|
new_lookups = Lookups()
|
||||||
|
new_lookups.from_bytes(lookups_bytes)
|
||||||
|
assert len(new_lookups) == 2
|
||||||
|
assert "table1" in new_lookups
|
||||||
|
assert "table2" in new_lookups
|
||||||
|
table1 = new_lookups.get_table("table1")
|
||||||
|
assert len(table1) == 2
|
||||||
|
assert table1.get("foo") == "bar"
|
||||||
|
table2 = new_lookups.get_table("table2")
|
||||||
|
assert len(table2) == 3
|
||||||
|
assert table2.get("b") == 2
|
||||||
|
assert new_lookups.to_bytes() == lookups_bytes
|
||||||
|
|
||||||
|
# This fails on Python 3.5
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_lookups_to_from_disk():
|
||||||
|
lookups = Lookups()
|
||||||
|
lookups.add_table("table1", {"foo": "bar", "hello": "world"})
|
||||||
|
lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
lookups.to_disk(tmpdir)
|
||||||
|
new_lookups = Lookups()
|
||||||
|
new_lookups.from_disk(tmpdir)
|
||||||
|
assert len(new_lookups) == 2
|
||||||
|
assert "table1" in new_lookups
|
||||||
|
assert "table2" in new_lookups
|
||||||
|
table1 = new_lookups.get_table("table1")
|
||||||
|
assert len(table1) == 2
|
||||||
|
assert table1.get("foo") == "bar"
|
||||||
|
table2 = new_lookups.get_table("table2")
|
||||||
|
assert len(table2) == 3
|
||||||
|
assert table2.get("b") == 2
|
||||||
|
|
||||||
|
# This fails on Python 3.5
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_lookups_to_from_bytes_via_vocab():
|
||||||
|
table_name = "test"
|
||||||
|
vocab = Vocab()
|
||||||
|
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
||||||
|
assert len(vocab.lookups) == 1
|
||||||
|
assert table_name in vocab.lookups
|
||||||
|
vocab_bytes = vocab.to_bytes()
|
||||||
|
new_vocab = Vocab()
|
||||||
|
new_vocab.from_bytes(vocab_bytes)
|
||||||
|
assert len(new_vocab.lookups) == 1
|
||||||
|
assert table_name in new_vocab.lookups
|
||||||
|
table = new_vocab.lookups.get_table(table_name)
|
||||||
|
assert len(table) == 2
|
||||||
|
assert table.get("hello") == "world"
|
||||||
|
assert new_vocab.to_bytes() == vocab_bytes
|
||||||
|
|
||||||
|
|
||||||
|
# This fails on Python 3.5
|
||||||
|
@pytest.mark.xfail
|
||||||
|
def test_lookups_to_from_disk_via_vocab():
|
||||||
|
table_name = "test"
|
||||||
|
vocab = Vocab()
|
||||||
|
vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
|
||||||
|
assert len(vocab.lookups) == 1
|
||||||
|
assert table_name in vocab.lookups
|
||||||
|
with make_tempdir() as tmpdir:
|
||||||
|
vocab.to_disk(tmpdir)
|
||||||
|
new_vocab = Vocab()
|
||||||
|
new_vocab.from_disk(tmpdir)
|
||||||
|
assert len(new_vocab.lookups) == 1
|
||||||
|
assert table_name in new_vocab.lookups
|
||||||
|
table = new_vocab.lookups.get_table(table_name)
|
||||||
|
assert len(table) == 2
|
||||||
|
assert table.get("hello") == "world"
|
||||||
|
|
|
@ -131,8 +131,7 @@ def load_language_data(path):
|
||||||
path = path.with_suffix(path.suffix + ".gz")
|
path = path.with_suffix(path.suffix + ".gz")
|
||||||
if path.exists():
|
if path.exists():
|
||||||
return srsly.read_gzip_json(path)
|
return srsly.read_gzip_json(path)
|
||||||
# TODO: move to spacy.errors
|
raise ValueError(Errors.E160.format(path=path2str(path)))
|
||||||
raise ValueError("Can't find language data file: {}".format(path2str(path)))
|
|
||||||
|
|
||||||
|
|
||||||
def get_module_path(module):
|
def get_module_path(module):
|
||||||
|
@ -458,6 +457,14 @@ def expand_exc(excs, search, replace):
|
||||||
|
|
||||||
|
|
||||||
def get_lemma_tables(lookups):
|
def get_lemma_tables(lookups):
|
||||||
|
"""Load lemmatizer data from lookups table. Mostly used via
|
||||||
|
Language.Defaults.create_lemmatizer, but available as helper so it can be
|
||||||
|
reused in language classes that implement custom lemmatizers.
|
||||||
|
|
||||||
|
lookups (Lookups): The lookups table.
|
||||||
|
RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
|
||||||
|
tuple that can be used to initialize a Lemmatizer.
|
||||||
|
"""
|
||||||
lemma_rules = {}
|
lemma_rules = {}
|
||||||
lemma_index = {}
|
lemma_index = {}
|
||||||
lemma_exc = {}
|
lemma_exc = {}
|
||||||
|
|
|
@ -43,6 +43,7 @@ cdef class Vocab:
|
||||||
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
lemmatizer (object): A lemmatizer. Defaults to `None`.
|
||||||
strings (StringStore): StringStore that maps strings to integers, and
|
strings (StringStore): StringStore that maps strings to integers, and
|
||||||
vice versa.
|
vice versa.
|
||||||
|
lookups (Lookups): Container for large lookup tables and dictionaries.
|
||||||
RETURNS (Vocab): The newly constructed object.
|
RETURNS (Vocab): The newly constructed object.
|
||||||
"""
|
"""
|
||||||
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}
|
||||||
|
@ -433,6 +434,8 @@ cdef class Vocab:
|
||||||
file_.write(self.lexemes_to_bytes())
|
file_.write(self.lexemes_to_bytes())
|
||||||
if "vectors" not in "exclude" and self.vectors is not None:
|
if "vectors" not in "exclude" and self.vectors is not None:
|
||||||
self.vectors.to_disk(path)
|
self.vectors.to_disk(path)
|
||||||
|
if "lookups" not in "exclude" and self.lookups is not None:
|
||||||
|
self.lookups.to_disk(path)
|
||||||
|
|
||||||
def from_disk(self, path, exclude=tuple(), **kwargs):
|
def from_disk(self, path, exclude=tuple(), **kwargs):
|
||||||
"""Loads state from a directory. Modifies the object in place and
|
"""Loads state from a directory. Modifies the object in place and
|
||||||
|
@ -457,6 +460,8 @@ cdef class Vocab:
|
||||||
self.vectors.from_disk(path, exclude=["strings"])
|
self.vectors.from_disk(path, exclude=["strings"])
|
||||||
if self.vectors.name is not None:
|
if self.vectors.name is not None:
|
||||||
link_vectors_to_models(self)
|
link_vectors_to_models(self)
|
||||||
|
if "lookups" not in exclude:
|
||||||
|
self.lookups.from_disk(path)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_bytes(self, exclude=tuple(), **kwargs):
|
def to_bytes(self, exclude=tuple(), **kwargs):
|
||||||
|
@ -476,7 +481,8 @@ cdef class Vocab:
|
||||||
getters = OrderedDict((
|
getters = OrderedDict((
|
||||||
("strings", lambda: self.strings.to_bytes()),
|
("strings", lambda: self.strings.to_bytes()),
|
||||||
("lexemes", lambda: self.lexemes_to_bytes()),
|
("lexemes", lambda: self.lexemes_to_bytes()),
|
||||||
("vectors", deserialize_vectors)
|
("vectors", deserialize_vectors),
|
||||||
|
("lookups", lambda: self.lookups.to_bytes())
|
||||||
))
|
))
|
||||||
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
|
exclude = util.get_serialization_exclude(getters, exclude, kwargs)
|
||||||
return util.to_bytes(getters, exclude)
|
return util.to_bytes(getters, exclude)
|
||||||
|
@ -499,7 +505,8 @@ cdef class Vocab:
|
||||||
setters = OrderedDict((
|
setters = OrderedDict((
|
||||||
("strings", lambda b: self.strings.from_bytes(b)),
|
("strings", lambda b: self.strings.from_bytes(b)),
|
||||||
("lexemes", lambda b: self.lexemes_from_bytes(b)),
|
("lexemes", lambda b: self.lexemes_from_bytes(b)),
|
||||||
("vectors", lambda b: serialize_vectors(b))
|
("vectors", lambda b: serialize_vectors(b)),
|
||||||
|
("lookups", lambda b: self.lookups.from_bytes(b))
|
||||||
))
|
))
|
||||||
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
|
exclude = util.get_serialization_exclude(setters, exclude, kwargs)
|
||||||
util.from_bytes(bytes_data, setters, exclude)
|
util.from_bytes(bytes_data, setters, exclude)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user