Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)

Commit 7b858ba606: Update from master

.flake8 (4 changes)
@@ -6,9 +6,5 @@ exclude =
    .env,
    .git,
    __pycache__,
    lemmatizer.py,
    lookup.py,
    _tokenizer_exceptions_list.py,
    spacy/lang/fr/lemmatizer,
    spacy/lang/nb/lemmatizer
    spacy/__init__.py
.github/contributors/mihaigliga21.md (vendored, new file, 106 lines)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

* you hereby assign to us joint ownership, and to the extent that such
assignment is or becomes invalid, ineffective or unenforceable, you hereby
grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
royalty-free, unrestricted license to exercise all rights under those
copyrights. This includes, at our option, the right to sublicense these same
rights to third parties through multiple levels of sublicensees or other
licensing arrangements;

* you agree that each of us can do all things in relation to your
contribution as if each of us were the sole owners, and if one of us makes
a derivative work of your contribution, the one who makes the derivative
work (or has it made) will be the sole owner of that derivative work;

* you agree that you will not assert any moral rights in your contribution
against us, our licensees or transferees;

* you agree that we may register a copyright in your contribution and
exercise all ownership rights associated with it; and

* you agree that neither of us has any duty to consult with, obtain the
consent of, pay or render an accounting to the other for any use or
distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

* make, have made, use, sell, offer to sell, import, and otherwise transfer
your contribution in whole or in part, alone or in combination with or
included in any product, work or materials arising out of the project to
which your contribution was submitted, and

* at our option, to sublicense these same rights to third parties through
multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

* Each contribution that you submit is and shall be an original work of
authorship and you can legally grant the rights set out in this SCA;

* to the best of your knowledge, each contribution will not violate any
third party's copyrights, trademarks, patents, or other intellectual
property rights; and

* each contribution shall be in compliance with U.S. export control laws and
other applicable export and import laws. You agree to notify us if you
become aware of any circumstance which would make any of the foregoing
representations inaccurate in any respect. We may publicly disclose your
participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

* [x] I am signing on behalf of my employer or a legal entity and I have the
actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                      |
|--------------------------------|----------------------------|
| Name                           | Mihai Gliga                |
| Company name (if applicable)   |                            |
| Title or role (if applicable)  |                            |
| Date                           | September 9, 2019          |
| GitHub username                | mihaigliga21               |
| Website (optional)             |                            |
@@ -5,7 +5,6 @@
from __future__ import unicode_literals

import plac
-import tqdm
from pathlib import Path
import re
import sys
@@ -5,7 +5,6 @@
from __future__ import unicode_literals

import plac
-import tqdm
from pathlib import Path
import re
import sys

@@ -486,6 +485,9 @@ def main(
    vectors_dir=None,
    use_oracle_segments=False,
):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    spacy.util.fix_random_seed()
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False
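The hunks above are the first of many in this commit that apply the same workaround: the module-level tqdm import is dropped and tqdm is imported inside the function that actually uses it, to dodge an import-order problem (see spaCy issue #4200 referenced in the added comments). A minimal sketch of the pattern, with an illustrative function name rather than spaCy's actual code, assuming tqdm is installed:

def iterate_with_progress(items):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm  # imported lazily, only when a progress bar is actually needed

    for item in tqdm.tqdm(items):
        yield item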
@@ -3,11 +3,9 @@
"""
from __future__ import unicode_literals
import plac
-import tqdm
import attr
from pathlib import Path
import re
import sys
import json

import spacy

@@ -23,7 +21,7 @@ import itertools
import random
import numpy.random

-import conll17_ud_eval
+from bin.ud import conll17_ud_eval

import spacy.lang.zh
import spacy.lang.ja

@@ -394,6 +392,9 @@ class TreebankPaths(object):
    limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
@@ -18,7 +18,6 @@ import random
import spacy
import thinc.extra.datasets
from spacy.util import minibatch, use_gpu, compounding
-import tqdm
from spacy._ml import Tok2Vec
from spacy.pipeline import TextCategorizer
import numpy

@@ -107,6 +106,9 @@ def create_pipeline(width, embed_size, vectors_model):


def train_tensorizer(nlp, texts, dropout, n_iter):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()

@@ -120,6 +122,9 @@ def train_tensorizer(nlp, texts, dropout, n_iter):


def train_textcat(nlp, n_texts, n_iter=10):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    textcat = nlp.get_pipe("textcat")
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
@@ -13,7 +13,6 @@ import numpy
import plac
import spacy
import tensorflow as tf
-import tqdm
from tensorflow.contrib.tensorboard.plugins.projector import (
    visualize_embeddings,
    ProjectorConfig,

@@ -36,6 +35,9 @@ from tensorflow.contrib.tensorboard.plugins.projector import (
    ),
)
def main(vectors_loc, out_loc, name="spaCy_vectors"):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    meta_file = "{}.tsv".format(name)
    out_meta_file = path.join(out_loc, meta_file)
@@ -3,7 +3,6 @@ from __future__ import unicode_literals

import plac
import math
-from tqdm import tqdm
import numpy
from ast import literal_eval
from pathlib import Path

@@ -109,6 +108,9 @@ def open_file(loc):


def read_attrs_from_deprecated(freqs_loc, clusters_loc):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    from tqdm import tqdm

    if freqs_loc is not None:
        with msg.loading("Counting frequencies..."):
            probs, _ = read_freqs(freqs_loc)

@@ -186,6 +188,9 @@ def add_vectors(nlp, vectors_loc, prune_vectors):


def read_vectors(vectors_loc):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    from tqdm import tqdm

    f = open_file(vectors_loc)
    shape = tuple(int(size) for size in next(f).split())
    vectors_data = numpy.zeros(shape=shape, dtype="f")

@@ -202,6 +207,9 @@ def read_vectors(vectors_loc):


def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    from tqdm import tqdm

    counts = PreshCounter()
    total = 0
    with freqs_loc.open() as f:

@@ -231,6 +239,9 @@ def read_freqs(freqs_loc, max_length=100, min_doc_freq=5, min_freq=50):


def read_clusters(clusters_loc):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    from tqdm import tqdm

    clusters = {}
    if ftfy is None:
        user_warning(Warnings.W004)
@@ -7,7 +7,6 @@ import srsly
import cProfile
import pstats
import sys
-import tqdm
import itertools
import thinc.extra.datasets
from wasabi import Printer

@@ -48,6 +47,9 @@ def profile(model, inputs=None, n_texts=10000):


def parse_texts(nlp, texts):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    for doc in nlp.pipe(tqdm.tqdm(texts), batch_size=16):
        pass
@@ -4,7 +4,6 @@ from __future__ import unicode_literals, division, print_function
import plac
import os
from pathlib import Path
-import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import shutil

@@ -103,6 +102,10 @@ def train(
    JSON format. To convert data from other formats, use the `spacy convert`
    command.
    """

    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    msg = Printer()
    util.fix_random_seed()
    util.set_env_log(verbose)

@@ -392,6 +395,9 @@ def _score_for_model(meta):

@contextlib.contextmanager
def _create_progress_bar(total):
    # temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
    import tqdm

    if int(os.environ.get("LOG_FRIENDLY", 0)):
        yield
    else:
@@ -452,6 +452,9 @@ class Errors(object):
            "Make sure that you're passing in absolute token indices, not "
            "relative token offsets.\nstart: {start}, end: {end}, label: "
            "{label}, direction: {dir}")
    E158 = ("Can't add table '{name}' to lookups because it already exists.")
    E159 = ("Can't find table '{name}' in lookups. Available tables: {tables}")
    E160 = ("Can't find language data file: {path}")


@add_codes
class TempErrors(object):
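The new codes follow the existing Errors pattern: each entry is a format string that call sites fill in with .format(), as the lookups.py and util.py hunks later in this diff do. A small stand-in sketch of how E158 ends up being raised (this does not import spaCy; the real class wraps the messages with @add_codes, which prepends the error code):

class Errors(object):
    # Stand-in for spacy.errors.Errors.
    E158 = "Can't add table '{name}' to lookups because it already exists."


def add_table(tables, name):
    if name in tables:
        raise ValueError(Errors.E158.format(name=name))
    tables[name] = {}


tables = {}
add_table(tables, "lemma_lookup")
try:
    add_table(tables, "lemma_lookup")
except ValueError as err:
    print(err)  # Can't add table 'lemma_lookup' to lookups because it already exists.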
@@ -3,7 +3,7 @@ from __future__ import unicode_literals

from .char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_CURRENCY
from .char_classes import LIST_ICONS, HYPHENS, CURRENCY, UNITS
-from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
+from .char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT


_prefixes = (

@@ -27,8 +27,8 @@ _suffixes = (
        r"(?<=°[FfCcKk])\.",
        r"(?<=[0-9])(?:{c})".format(c=CURRENCY),
        r"(?<=[0-9])(?:{u})".format(u=UNITS),
-        r"(?<=[0-9{al}{e}(?:{q})])\.".format(
-            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES
+        r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
+            al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
        ),
        r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER),
    ]
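The change adds PUNCT to the character class of the final-period suffix rule, so a period that follows a punctuation character (for example the underscore in "_MATH_.") can now be split off as its own token; the new test further down exercises exactly that case. A rough illustration with small stand-in values for the char_classes constants (the real ALPHA_LOWER, CONCAT_QUOTES and PUNCT cover far more characters):

import re

ALPHA_LOWER = "a-z"        # stand-in; spaCy's value covers all lowercase letters
CONCAT_QUOTES = "'\"”“"    # stand-in quote characters
PUNCT = r"_#&"             # stand-in; the point is that "_" is now in the class

suffix = r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format(
    al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT
)
print(bool(re.search(suffix, "_MATH_.")))  # True: the trailing "." can be split off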
@@ -9,6 +9,7 @@ from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
from .tag_map import TAG_MAP

# Lemma data note:
# Original pairs downloaded from http://www.lexiconista.com/datasets/lemmatization/

@@ -24,6 +25,7 @@ class RomanianDefaults(Language.Defaults):
    tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
    stop_words = STOP_WORDS
    resources = {"lemma_lookup": "lemma_lookup.json"}
    tag_map = TAG_MAP


class Romanian(Language):
spacy/lang/ro/tag_map.py (new file, 2085 lines): file diff suppressed because it is too large.
@@ -24,7 +24,7 @@ class UkrainianDefaults(Language.Defaults):
    stop_words = STOP_WORDS

    @classmethod
-    def create_lemmatizer(cls, nlp=None):
+    def create_lemmatizer(cls, nlp=None, **kwargs):
        return UkrainianLemmatizer()
spacy/lookups.py (127 changes)
@@ -1,52 +1,157 @@
# coding: utf8
from __future__ import unicode_literals

-from .util import SimpleFrozenDict
import srsly
from collections import OrderedDict

from .errors import Errors
from .util import SimpleFrozenDict, ensure_path


class Lookups(object):
    """Container for large lookup tables and dictionaries, e.g. lemmatization
    data or tokenizer exception lists. Lookups are available via vocab.lookups,
    so they can be accessed before the pipeline components are applied (e.g.
    in the tokenizer and lemmatizer), as well as within the pipeline components
    via doc.vocab.lookups.

    Important note: At the moment, this class only performs a very basic
    dictionary lookup. We're planning to replace this with a more efficient
    implementation. See #3971 for details.
    """

    def __init__(self):
-        self._tables = {}
        """Initialize the Lookups object.

        RETURNS (Lookups): The newly created object.
        """
        self._tables = OrderedDict()

    def __contains__(self, name):
        """Check if the lookups contain a table of a given name. Delegates to
        Lookups.has_table.

        name (unicode): Name of the table.
        RETURNS (bool): Whether a table of that name exists.
        """
        return self.has_table(name)

    def __len__(self):
        """RETURNS (int): The number of tables in the lookups."""
        return len(self._tables)

    @property
    def tables(self):
        """RETURNS (list): Names of all tables in the lookups."""
        return list(self._tables.keys())

    def add_table(self, name, data=SimpleFrozenDict()):
        """Add a new table to the lookups. Raises an error if the table exists.

        name (unicode): Unique name of table.
        data (dict): Optional data to add to the table.
        RETURNS (Table): The newly added table.
        """
        if name in self.tables:
-            raise ValueError("Table '{}' already exists".format(name))
+            raise ValueError(Errors.E158.format(name=name))
        table = Table(name=name)
        table.update(data)
        self._tables[name] = table
        return table

    def get_table(self, name):
        """Get a table. Raises an error if the table doesn't exist.

        name (unicode): Name of the table.
        RETURNS (Table): The table.
        """
        if name not in self._tables:
-            raise KeyError("Can't find table '{}'".format(name))
+            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return self._tables[name]

    def remove_table(self, name):
        """Remove a table. Raises an error if the table doesn't exist.

        name (unicode): The name to remove.
        RETURNS (Table): The removed table.
        """
        if name not in self._tables:
            raise KeyError(Errors.E159.format(name=name, tables=self.tables))
        return self._tables.pop(name)

    def has_table(self, name):
        """Check if the lookups contain a table of a given name.

        name (unicode): Name of the table.
        RETURNS (bool): Whether a table of that name exists.
        """
        return name in self._tables

    def to_bytes(self, exclude=tuple(), **kwargs):
-        raise NotImplementedError
        """Serialize the lookups to a bytestring.

        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The serialized Lookups.
        """
        return srsly.msgpack_dumps(self._tables)

    def from_bytes(self, bytes_data, exclude=tuple(), **kwargs):
-        raise NotImplementedError
        """Load the lookups from a bytestring.

-    def to_disk(self, path, exclude=tuple(), **kwargs):
-        raise NotImplementedError
        exclude (list): String names of serialization fields to exclude.
        RETURNS (bytes): The loaded Lookups.
        """
        self._tables = OrderedDict()
        msg = srsly.msgpack_loads(bytes_data)
        for key, value in msg.items():
            self._tables[key] = Table.from_dict(value)
        return self

-    def from_disk(self, path, exclude=tuple(), **kwargs):
-        raise NotImplementedError
    def to_disk(self, path, **kwargs):
        """Save the lookups to a directory as lookups.bin.

        path (unicode / Path): The file path.
        """
        if len(self._tables):
            path = ensure_path(path)
            filepath = path / "lookups.bin"
            with filepath.open("wb") as file_:
                file_.write(self.to_bytes())

    def from_disk(self, path, **kwargs):
        """Load lookups from a directory containing a lookups.bin.

        path (unicode / Path): The file path.
        RETURNS (Lookups): The loaded lookups.
        """
        path = ensure_path(path)
        filepath = path / "lookups.bin"
        if filepath.exists():
            with filepath.open("rb") as file_:
                data = file_.read()
            return self.from_bytes(data)
        return self


-class Table(dict):
+class Table(OrderedDict):
    """A table in the lookups. Subclass of builtin dict that implements a
    slightly more consistent and unified API.
    """
    @classmethod
    def from_dict(cls, data, name=None):
        self = cls(name=name)
        self.update(data)
        return self

    def __init__(self, name=None):
        """Initialize a new table.

        name (unicode): Optional table name for reference.
        RETURNS (Table): The newly created object.
        """
        OrderedDict.__init__(self)
        self.name = name

    def set(self, key, value):
        """Set new key/value pair. Same as table[key] = value."""
        self[key] = value
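Taken together, the rewritten Lookups and Table classes give a small dict-like API with msgpack-based serialization. A short usage sketch, based only on the methods visible in this hunk (it assumes a spaCy checkout at the state of this commit; per the tests further down, the byte round-trip is marked as failing on Python 3.5):

from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"going": "go", "was": "be"})
assert "lemma_lookup" in lookups          # __contains__ delegates to has_table
assert lookups.get_table("lemma_lookup").get("was") == "be"

# Byte round-trip, backed by srsly.msgpack_dumps / msgpack_loads as shown above
data = lookups.to_bytes()
restored = Lookups().from_bytes(data)
assert restored.get_table("lemma_lookup")["going"] == "go"

# On disk, all tables are written to a single "lookups.bin" in the directory:
# lookups.to_disk("/path/to/model")
# restored = Lookups().from_disk("/path/to/model")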
@@ -133,3 +133,9 @@ def test_en_tokenizer_splits_em_dash_infix(en_tokenizer):
    assert tokens[6].text == "Puddleton"
    assert tokens[7].text == "?"
    assert tokens[8].text == "\u2014"


@pytest.mark.parametrize("text,length", [("_MATH_", 3), ("_MATH_.", 4)])
def test_final_period(en_tokenizer, text, length):
    tokens = en_tokenizer(text)
    assert len(tokens) == length
@@ -13,7 +13,6 @@ from spacy.lemmatizer import Lemmatizer
from spacy.symbols import ORTH, LEMMA, POS, VERB, VerbForm_part


-@pytest.mark.xfail
def test_issue1061():
    '''Test special-case works after tokenizing. Was caching problem.'''
    text = 'I like _MATH_ even _MATH_ when _MATH_, except when _MATH_ is _MATH_! but not _MATH_.'
@@ -41,8 +41,8 @@ def test_serialize_parser_roundtrip_bytes(en_vocab, Parser):
    parser.model, _ = parser.Model(10)
    new_parser = Parser(en_vocab)
    new_parser.model, _ = new_parser.Model(10)
-    new_parser = new_parser.from_bytes(parser.to_bytes())
-    assert new_parser.to_bytes() == parser.to_bytes()
+    new_parser = new_parser.from_bytes(parser.to_bytes(exclude=["vocab"]))
+    assert new_parser.to_bytes(exclude=["vocab"]) == parser.to_bytes(exclude=["vocab"])


@pytest.mark.parametrize("Parser", test_parsers)

@@ -55,8 +55,8 @@ def test_serialize_parser_roundtrip_disk(en_vocab, Parser):
    parser_d = Parser(en_vocab)
    parser_d.model, _ = parser_d.Model(0)
    parser_d = parser_d.from_disk(file_path)
-    parser_bytes = parser.to_bytes(exclude=["model"])
-    parser_d_bytes = parser_d.to_bytes(exclude=["model"])
+    parser_bytes = parser.to_bytes(exclude=["model", "vocab"])
+    parser_d_bytes = parser_d.to_bytes(exclude=["model", "vocab"])
    assert parser_bytes == parser_d_bytes
@@ -64,7 +64,7 @@ def test_to_from_bytes(parser, blank_parser):
    assert parser.model is not True
    assert blank_parser.model is True
    assert blank_parser.moves.n_moves != parser.moves.n_moves
-    bytes_data = parser.to_bytes()
+    bytes_data = parser.to_bytes(exclude=["vocab"])
    blank_parser.from_bytes(bytes_data)
    assert blank_parser.model is not True
    assert blank_parser.moves.n_moves == parser.moves.n_moves
@@ -97,9 +97,9 @@ def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
def test_serialize_tensorizer_roundtrip_bytes(en_vocab):
    tensorizer = Tensorizer(en_vocab)
    tensorizer.model = tensorizer.Model()
-    tensorizer_b = tensorizer.to_bytes()
+    tensorizer_b = tensorizer.to_bytes(exclude=["vocab"])
    new_tensorizer = Tensorizer(en_vocab).from_bytes(tensorizer_b)
-    assert new_tensorizer.to_bytes() == tensorizer_b
+    assert new_tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_b


def test_serialize_tensorizer_roundtrip_disk(en_vocab):

@@ -109,13 +109,15 @@ def test_serialize_tensorizer_roundtrip_disk(en_vocab):
    file_path = d / "tensorizer"
    tensorizer.to_disk(file_path)
    tensorizer_d = Tensorizer(en_vocab).from_disk(file_path)
-    assert tensorizer.to_bytes() == tensorizer_d.to_bytes()
+    assert tensorizer.to_bytes(exclude=["vocab"]) == tensorizer_d.to_bytes(
+        exclude=["vocab"]
+    )


def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
-    textcat.to_bytes()
+    textcat.to_bytes(exclude=["vocab"])


@pytest.mark.parametrize("Parser", test_parsers)

@@ -128,13 +130,17 @@ def test_serialize_pipe_exclude(en_vocab, Parser):
    parser = Parser(en_vocab)
    parser.model, _ = parser.Model(0)
    parser.cfg["foo"] = "bar"
-    new_parser = get_new_parser().from_bytes(parser.to_bytes())
+    new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]))
    assert "foo" in new_parser.cfg
-    new_parser = get_new_parser().from_bytes(parser.to_bytes(), exclude=["cfg"])
+    new_parser = get_new_parser().from_bytes(
+        parser.to_bytes(exclude=["vocab"]), exclude=["cfg"]
+    )
    assert "foo" not in new_parser.cfg
-    new_parser = get_new_parser().from_bytes(parser.to_bytes(exclude=["cfg"]))
+    new_parser = get_new_parser().from_bytes(
+        parser.to_bytes(exclude=["cfg"]), exclude=["vocab"]
+    )
    assert "foo" not in new_parser.cfg
    with pytest.raises(ValueError):
-        parser.to_bytes(cfg=False)
+        parser.to_bytes(cfg=False, exclude=["vocab"])
    with pytest.raises(ValueError):
-        get_new_parser().from_bytes(parser.to_bytes(), cfg=False)
+        get_new_parser().from_bytes(parser.to_bytes(exclude=["vocab"]), cfg=False)
@@ -12,12 +12,14 @@ test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]


@pytest.mark.xfail
@pytest.mark.parametrize("text", ["rat"])
def test_serialize_vocab(en_vocab, text):
    text_hash = en_vocab.strings.add(text)
-    vocab_bytes = en_vocab.to_bytes()
+    vocab_bytes = en_vocab.to_bytes(exclude=["lookups"])
    new_vocab = Vocab().from_bytes(vocab_bytes)
    assert new_vocab.strings[text_hash] == text
    assert new_vocab.to_bytes(exclude=["lookups"]) == vocab_bytes


@pytest.mark.parametrize("strings1,strings2", test_strings)
@@ -3,6 +3,9 @@ from __future__ import unicode_literals

import pytest
from spacy.lookups import Lookups
from spacy.vocab import Vocab

from ..util import make_tempdir


def test_lookups_api():

@@ -10,6 +13,7 @@ def test_lookups_api():
    data = {"foo": "bar", "hello": "world"}
    lookups = Lookups()
    lookups.add_table(table_name, data)
    assert len(lookups) == 1
    assert table_name in lookups
    assert lookups.has_table(table_name)
    table = lookups.get_table(table_name)

@@ -22,5 +26,89 @@ def test_lookups_api():
    assert len(table) == 3
    with pytest.raises(KeyError):
        lookups.get_table("xyz")
-    # with pytest.raises(ValueError):
-    #     lookups.add_table(table_name)
+    with pytest.raises(ValueError):
+        lookups.add_table(table_name)
    table = lookups.remove_table(table_name)
    assert table.name == table_name
    assert len(lookups) == 0
    assert table_name not in lookups
    with pytest.raises(KeyError):
        lookups.get_table(table_name)


# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_bytes():
    lookups = Lookups()
    lookups.add_table("table1", {"foo": "bar", "hello": "world"})
    lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
    lookups_bytes = lookups.to_bytes()
    new_lookups = Lookups()
    new_lookups.from_bytes(lookups_bytes)
    assert len(new_lookups) == 2
    assert "table1" in new_lookups
    assert "table2" in new_lookups
    table1 = new_lookups.get_table("table1")
    assert len(table1) == 2
    assert table1.get("foo") == "bar"
    table2 = new_lookups.get_table("table2")
    assert len(table2) == 3
    assert table2.get("b") == 2
    assert new_lookups.to_bytes() == lookups_bytes


# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_disk():
    lookups = Lookups()
    lookups.add_table("table1", {"foo": "bar", "hello": "world"})
    lookups.add_table("table2", {"a": 1, "b": 2, "c": 3})
    with make_tempdir() as tmpdir:
        lookups.to_disk(tmpdir)
        new_lookups = Lookups()
        new_lookups.from_disk(tmpdir)
    assert len(new_lookups) == 2
    assert "table1" in new_lookups
    assert "table2" in new_lookups
    table1 = new_lookups.get_table("table1")
    assert len(table1) == 2
    assert table1.get("foo") == "bar"
    table2 = new_lookups.get_table("table2")
    assert len(table2) == 3
    assert table2.get("b") == 2


# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_bytes_via_vocab():
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert len(vocab.lookups) == 1
    assert table_name in vocab.lookups
    vocab_bytes = vocab.to_bytes()
    new_vocab = Vocab()
    new_vocab.from_bytes(vocab_bytes)
    assert len(new_vocab.lookups) == 1
    assert table_name in new_vocab.lookups
    table = new_vocab.lookups.get_table(table_name)
    assert len(table) == 2
    assert table.get("hello") == "world"
    assert new_vocab.to_bytes() == vocab_bytes


# This fails on Python 3.5
@pytest.mark.xfail
def test_lookups_to_from_disk_via_vocab():
    table_name = "test"
    vocab = Vocab()
    vocab.lookups.add_table(table_name, {"foo": "bar", "hello": "world"})
    assert len(vocab.lookups) == 1
    assert table_name in vocab.lookups
    with make_tempdir() as tmpdir:
        vocab.to_disk(tmpdir)
        new_vocab = Vocab()
        new_vocab.from_disk(tmpdir)
    assert len(new_vocab.lookups) == 1
    assert table_name in new_vocab.lookups
    table = new_vocab.lookups.get_table(table_name)
    assert len(table) == 2
    assert table.get("hello") == "world"
@@ -16,10 +16,10 @@ cdef class Tokenizer:
    cdef PreshMap _specials
    cpdef readonly Vocab vocab

-    cdef public object token_match
-    cdef public object prefix_search
-    cdef public object suffix_search
-    cdef public object infix_finditer
+    cdef object _token_match
+    cdef object _prefix_search
+    cdef object _suffix_search
+    cdef object _infix_finditer
    cdef object _rules

    cpdef Doc tokens_from_list(self, list strings)
@@ -61,6 +61,38 @@ cdef class Tokenizer:
        for chunk, substrings in sorted(rules.items()):
            self.add_special_case(chunk, substrings)

    property token_match:
        def __get__(self):
            return self._token_match

        def __set__(self, token_match):
            self._token_match = token_match
            self._flush_cache()

    property prefix_search:
        def __get__(self):
            return self._prefix_search

        def __set__(self, prefix_search):
            self._prefix_search = prefix_search
            self._flush_cache()

    property suffix_search:
        def __get__(self):
            return self._suffix_search

        def __set__(self, suffix_search):
            self._suffix_search = suffix_search
            self._flush_cache()

    property infix_finditer:
        def __get__(self):
            return self._infix_finditer

        def __set__(self, infix_finditer):
            self._infix_finditer = infix_finditer
            self._flush_cache()

    def __reduce__(self):
        args = (self.vocab,
                self._rules,

@@ -141,9 +173,23 @@ cdef class Tokenizer:
        for text in texts:
            yield self(text)

    def _flush_cache(self):
        self._reset_cache([key for key in self._cache if not key in self._specials])

    def _reset_cache(self, keys):
        for k in keys:
            del self._cache[k]
            if not k in self._specials:
                cached = <_Cached*>self._cache.get(k)
                if cached is not NULL:
                    self.mem.free(cached)

    def _reset_specials(self):
        for k in self._specials:
            cached = <_Cached*>self._specials.get(k)
            del self._specials[k]
            if cached is not NULL:
                self.mem.free(cached)

    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
        cached = <_Cached*>self._cache.get(key)

@@ -183,6 +229,9 @@ cdef class Tokenizer:
        while string and len(string) != last_size:
            if self.token_match and self.token_match(string):
                break
            if self._specials.get(hash_string(string)) != NULL:
                has_special[0] = 1
                break
            last_size = len(string)
            pre_len = self.find_prefix(string)
            if pre_len != 0:

@@ -360,8 +409,15 @@ cdef class Tokenizer:
        cached.is_lex = False
        cached.data.tokens = self.vocab.make_fused_token(substrings)
        key = hash_string(string)
        stale_special = <_Cached*>self._specials.get(key)
        stale_cached = <_Cached*>self._cache.get(key)
        self._flush_cache()
        self._specials.set(key, cached)
        self._cache.set(key, cached)
        if stale_special is not NULL:
            self.mem.free(stale_special)
        if stale_special != stale_cached and stale_cached is not NULL:
            self.mem.free(stale_cached)
        self._rules[string] = substrings

    def to_disk(self, path, **kwargs):

@@ -444,7 +500,10 @@ cdef class Tokenizer:
        if data.get("rules"):
            # make sure to hard reset the cache to remove data from the default exceptions
            self._rules = {}
            self._reset_cache([key for key in self._cache])
            self._reset_specials()
            self._cache = PreshMap()
            self._specials = PreshMap()
            for string, substrings in data.get("rules", {}).items():
                self.add_special_case(string, substrings)
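The tokenizer hunks above turn the four public attributes into properties whose setters call _flush_cache(), so reassigning token_match, prefix_search, suffix_search or infix_finditer after construction cannot serve stale cached tokenizations. Ignoring the Cython and PreshMap details, the same idea in plain Python looks roughly like this (an illustrative stand-in class, not spaCy's Tokenizer):

import re


class CachedSplitter(object):
    """Results are cached per input string; reassigning the rule flushes the cache."""

    def __init__(self, prefix_search):
        self._prefix_search = prefix_search
        self._cache = {}

    @property
    def prefix_search(self):
        return self._prefix_search

    @prefix_search.setter
    def prefix_search(self, prefix_search):
        self._prefix_search = prefix_search
        self._flush_cache()

    def _flush_cache(self):
        self._cache = {}

    def __call__(self, text):
        if text not in self._cache:
            match = self._prefix_search(text)
            prefix = match.group() if match else ""
            self._cache[text] = [p for p in (prefix, text[len(prefix):]) if p]
        return self._cache[text]


splitter = CachedSplitter(re.compile(r"\(").match)
print(splitter("(hello"))          # ['(', 'hello']
splitter.prefix_search = re.compile(r"\[").match
print(splitter("(hello"))          # ['(hello']: the cache was flushed, new rule applies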
@@ -131,8 +131,7 @@ def load_language_data(path):
    path = path.with_suffix(path.suffix + ".gz")
    if path.exists():
        return srsly.read_gzip_json(path)
-    # TODO: move to spacy.errors
-    raise ValueError("Can't find language data file: {}".format(path2str(path)))
+    raise ValueError(Errors.E160.format(path=path2str(path)))


def get_module_path(module):

@@ -458,6 +457,14 @@ def expand_exc(excs, search, replace):


def get_lemma_tables(lookups):
    """Load lemmatizer data from lookups table. Mostly used via
    Language.Defaults.create_lemmatizer, but available as helper so it can be
    reused in language classes that implement custom lemmatizers.

    lookups (Lookups): The lookups table.
    RETURNS (tuple): A (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
        tuple that can be used to initialize a Lemmatizer.
    """
    lemma_rules = {}
    lemma_index = {}
    lemma_exc = {}
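Per the docstring above, the helper turns a Lookups container into the four objects a Lemmatizer is initialized from. A hedged sketch of a call site: the table names used here ("lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup") are assumptions matching the documented return tuple, and the rest of the helper's body is not shown in this hunk, so treat this as illustrative rather than a verified API contract.

from spacy.lookups import Lookups
from spacy.util import get_lemma_tables

lookups = Lookups()
# Assumed table names; only "lemma_lookup" is confirmed elsewhere in this diff
# (the Romanian resources entry).
lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
lookups.add_table("lemma_index", {"noun": ["woman"]})
lookups.add_table("lemma_exc", {"noun": {"women": ["woman"]}})
lookups.add_table("lemma_lookup", {"was": "be"})

# Documented return: (lemma_rules, lemma_index, lemma_exc, lemma_lookup)
lemma_rules, lemma_index, lemma_exc, lemma_lookup = get_lemma_tables(lookups)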
@@ -43,6 +43,7 @@ cdef class Vocab:
        lemmatizer (object): A lemmatizer. Defaults to `None`.
        strings (StringStore): StringStore that maps strings to integers, and
            vice versa.
        lookups (Lookups): Container for large lookup tables and dictionaries.
        RETURNS (Vocab): The newly constructed object.
        """
        lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {}

@@ -433,6 +434,8 @@ cdef class Vocab:
            file_.write(self.lexemes_to_bytes())
        if "vectors" not in "exclude" and self.vectors is not None:
            self.vectors.to_disk(path)
        if "lookups" not in "exclude" and self.lookups is not None:
            self.lookups.to_disk(path)

    def from_disk(self, path, exclude=tuple(), **kwargs):
        """Loads state from a directory. Modifies the object in place and

@@ -457,6 +460,8 @@ cdef class Vocab:
            self.vectors.from_disk(path, exclude=["strings"])
            if self.vectors.name is not None:
                link_vectors_to_models(self)
        if "lookups" not in exclude:
            self.lookups.from_disk(path)
        return self

    def to_bytes(self, exclude=tuple(), **kwargs):

@@ -477,6 +482,7 @@ cdef class Vocab:
            ("strings", lambda: self.strings.to_bytes()),
            ("lexemes", lambda: self.lexemes_to_bytes()),
            ("vectors", deserialize_vectors),
            ("lookups", lambda: self.lookups.to_bytes())
        ))
        exclude = util.get_serialization_exclude(getters, exclude, kwargs)
        return util.to_bytes(getters, exclude)

@@ -500,6 +506,7 @@ cdef class Vocab:
            ("strings", lambda b: self.strings.from_bytes(b)),
            ("lexemes", lambda b: self.lexemes_from_bytes(b)),
            ("vectors", lambda b: serialize_vectors(b)),
            ("lookups", lambda b: self.lookups.from_bytes(b))
        ))
        exclude = util.get_serialization_exclude(setters, exclude, kwargs)
        util.from_bytes(bytes_data, setters, exclude)
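The vocab changes extend an existing pattern: serialization is driven by an ordered mapping of section names to getter (or setter) callables, and exclude simply drops sections by name, which is how the new "lookups" section slots in next to "strings", "lexemes" and "vectors". A self-contained sketch of the same getters/exclude idea, using srsly directly rather than spaCy's actual util.to_bytes helper:

from collections import OrderedDict

import srsly


def to_bytes(getters, exclude):
    # Serialize each non-excluded section by calling its getter, in order.
    serialized = OrderedDict(
        (key, getter()) for key, getter in getters.items() if key not in exclude
    )
    return srsly.msgpack_dumps(serialized)


class MiniVocab(object):
    # Illustrative stand-in, not spacy.vocab.Vocab.
    def __init__(self, strings, lookups_bytes):
        self.strings = strings
        self.lookups_bytes = lookups_bytes

    def to_bytes(self, exclude=tuple()):
        getters = OrderedDict((
            ("strings", lambda: self.strings),
            ("lookups", lambda: self.lookups_bytes),  # the section added in this commit
        ))
        return to_bytes(getters, exclude)


vocab = MiniVocab(["hello", "world"], b"\x80")
assert vocab.to_bytes(exclude=["lookups"]) != vocab.to_bytes()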