mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
37c7c85a86
* Support nowrap setting in util.prints * Tidy up and fix whitespace * Simplify script and use read_jsonl helper * Add JSON schemas (see #2928) * Deprecate Doc.print_tree Will be replaced with Doc.to_json, which will produce a unified format * Add Doc.to_json() method (see #2928) Converts Doc objects to JSON using the same unified format as the training data. Method also supports serializing selected custom attributes in the doc._. space. * Remove outdated test * Add write_json and write_jsonl helpers * WIP: Update spacy train * Tidy up spacy train * WIP: Use wasabi for formatting * Add GoldParse helpers for JSON format * WIP: add debug-data command * Fix typo * Add missing import * Update wasabi pin * Add missing import * 💫 Refactor CLI (#2943) To be merged into #2932. ## Description - [x] refactor CLI To use [`wasabi`](https://github.com/ines/wasabi) - [x] use [`black`](https://github.com/ambv/black) for auto-formatting - [x] add `flake8` config - [x] move all messy UD-related scripts to `cli.ud` - [x] make converters function that take the opened file and return the converted data (instead of having them handle the IO) ### Types of change enhancement ## Checklist <!--- Before you submit the PR, go over this checklist and make sure you can tick off all the boxes. [] -> [x] --> - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. * Update wasabi pin * Delete old test * Update errors * Fix typo * Tidy up and format remaining code * Fix formatting * Improve formatting of messages * Auto-format remaining code * Add tok2vec stuff to spacy.train * Fix typo * Update wasabi pin * Fix path checks for when train() is called as function * Reformat and tidy up pretrain script * Update argument annotations * Raise error if model language doesn't match lang * Document new train command
113 lines
3.6 KiB
Python
113 lines
3.6 KiB
Python
# coding: utf-8
|
|
from __future__ import unicode_literals
|
|
|
|
import pytest
|
|
from pathlib import Path
|
|
from spacy import util
|
|
from spacy import displacy
|
|
from spacy import prefer_gpu, require_gpu
|
|
from spacy.tokens import Span
|
|
from spacy._ml import PrecomputableAffine
|
|
|
|
from .util import get_doc
|
|
|
|
|
|
@pytest.mark.parametrize("text", ["hello/world", "hello world"])
def test_util_ensure_path_succeeds(text):
    """Check that util.ensure_path returns a pathlib.Path for a string input."""
    assert isinstance(util.ensure_path(text), Path)
|
|
|
|
|
|
@pytest.mark.parametrize("package", ["numpy"])
def test_util_is_package(package):
    """Check that util.is_package gives a truthy result for the given package name."""
    recognised = util.is_package(package)
    assert recognised
|
|
|
|
|
|
@pytest.mark.parametrize("package", ["thinc"])
def test_util_get_package_path(package):
    """Check that util.get_package_path returns a pathlib.Path for a package name."""
    assert isinstance(util.get_package_path(package), Path)
|
|
|
|
|
|
def test_displacy_parse_ents(en_vocab):
    """Check displacy.parse_ents output for a Doc with one ORG entity.

    The second token ("Google") is marked as an ORG span; the parsed result
    must be a dict whose "text" and "ents" entries match the expected values.
    """
    words = ["But", "Google", "is", "starting", "from", "behind"]
    doc = get_doc(en_vocab, words=words)
    org_label = doc.vocab.strings["ORG"]
    doc.ents = [Span(doc, 1, 2, label=org_label)]

    parsed = displacy.parse_ents(doc)

    assert isinstance(parsed, dict)
    assert parsed["text"] == "But Google is starting from behind "
    expected_ents = [{"start": 4, "end": 10, "label": "ORG"}]
    assert parsed["ents"] == expected_ents
|
|
|
|
|
|
def test_displacy_parse_deps(en_vocab):
    """Check displacy.parse_deps output for a small annotated Doc.

    Builds a four-token Doc with heads, coarse POS, fine-grained tags and
    dependency labels, then compares the "words" and "arcs" entries of the
    parsed dict against the expected lists.
    """
    words = ["This", "is", "a", "sentence"]
    heads = [1, 0, 1, -2]
    pos = ["DET", "VERB", "DET", "NOUN"]
    tags = ["DT", "VBZ", "DT", "NN"]
    dep_labels = ["nsubj", "ROOT", "det", "attr"]
    doc = get_doc(
        en_vocab, words=words, heads=heads, pos=pos, tags=tags, deps=dep_labels
    )

    parsed = displacy.parse_deps(doc)
    assert isinstance(parsed, dict)

    # The expected "tag" of each word is its coarse POS value.
    expected_words = [
        {"text": word, "tag": coarse} for word, coarse in zip(words, pos)
    ]
    assert parsed["words"] == expected_words

    expected_arcs = [
        {"start": 0, "end": 1, "label": "nsubj", "dir": "left"},
        {"start": 2, "end": 3, "label": "det", "dir": "left"},
        {"start": 1, "end": 3, "label": "attr", "dir": "right"},
    ]
    assert parsed["arcs"] == expected_arcs
|
|
|
|
|
|
def test_displacy_spans(en_vocab):
    """Check that displacy.render accepts a Span and returns markup starting with a div."""
    words = ["But", "Google", "is", "starting", "from", "behind"]
    doc = get_doc(en_vocab, words=words)
    org_label = doc.vocab.strings["ORG"]
    doc.ents = [Span(doc, 1, 2, label=org_label)]

    span = doc[1:4]
    rendered = displacy.render(span, style="ent")
    assert rendered.startswith("<div")
|
|
|
|
|
|
def test_displacy_raises_for_wrong_type(en_vocab):
    """Check that displacy.render raises ValueError when handed a plain string."""
    not_a_doc = "hello world"
    with pytest.raises(ValueError):
        displacy.render(not_a_doc)
|
|
|
|
|
|
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
    """Exercise PrecomputableAffine shapes and its padding-gradient bookkeeping.

    First checks the shapes of the weights, the forward output and ``d_pad``.
    Then calls ``_backprop_padding`` twice and asserts the value found in
    ``d_pad[0, 2, 0, 0]``: 1.0 after one id in row 1 is set to -1, and 3.0
    after three ids in row 1 are set to -1 (with buffers zeroed in between).
    """
    model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP)
    assert model.W.shape == (nF, nO, nP, nI)

    X = model.ops.allocate((10, nI))
    Y, _get_dX = model.begin_update(X)
    # The output is expected to carry one more row than the input.
    assert Y.shape == (X.shape[0] + 1, nF, nO, nP)
    assert model.d_pad.shape == (1, nF, nO, nP)

    d_out = model.ops.allocate((15, nO, nP))
    feat_ids = model.ops.allocate((15, nF))

    # Round 1: a single id in row 1 is marked with -1.
    feat_ids[1, 2] = -1
    d_out[1] = 1
    assert model.d_pad[0, 2, 0, 0] == 0.0
    model._backprop_padding(d_out, feat_ids)
    assert model.d_pad[0, 2, 0, 0] == 1.0

    # Zero every buffer so round 2 starts from a clean state.
    model.d_pad.fill(0.0)
    for array in (feat_ids, d_out):
        array.fill(0.0)

    # Round 2: three ids in row 1 are marked with -1.
    for column in (2, 1, 0):
        feat_ids[1, column] = -1
    d_out[1] = 1
    assert model.d_pad[0, 2, 0, 0] == 0.0
    model._backprop_padding(d_out, feat_ids)
    assert model.d_pad[0, 2, 0, 0] == 3.0
|
|
|
|
|
|
def test_prefer_gpu():
    """Check that prefer_gpu() returns a falsy value when no GPU backend exists.

    The assertion only holds on CPU-only environments, so the test is skipped
    when a GPU backend appears to be installed instead of failing there.
    """
    # NOTE(review): assumes GPU availability is decided by whether `cupy`
    # can be imported -- confirm against the thinc implementation.
    try:
        import cupy  # noqa: F401
    except ImportError:
        pass
    else:
        pytest.skip("cupy is installed; prefer_gpu() is expected to succeed")
    assert not prefer_gpu()
|
|
|
|
|
|
def test_require_gpu():
    """Check that require_gpu() raises ValueError when no GPU backend exists.

    The expectation only holds on CPU-only environments, so the test is
    skipped when a GPU backend appears to be installed instead of failing.
    """
    # NOTE(review): assumes GPU availability is decided by whether `cupy`
    # can be imported -- confirm against the thinc implementation.
    try:
        import cupy  # noqa: F401
    except ImportError:
        pass
    else:
        pytest.skip("cupy is installed; require_gpu() is expected to succeed")
    with pytest.raises(ValueError):
        require_gpu()
|