Merge remote-tracking branch 'upstream/master' into feature/hashmatcher

This commit is contained in:
Adriane Boyd 2019-09-19 17:42:17 +02:00
commit 3931368ce8
145 changed files with 13767 additions and 1850 deletions

View File

@ -1,7 +1,17 @@
SHELL := /bin/bash SHELL := /bin/bash
sha = $(shell "git" "rev-parse" "--short" "HEAD") sha = $(shell "git" "rev-parse" "--short" "HEAD")
version = $(shell "bin/get-version.sh")
wheel = spacy-$(version)-cp36-cp36m-linux_x86_64.whl
dist/spacy.pex : spacy/*.py* spacy/*/*.py* dist/spacy.pex : dist/spacy-$(sha).pex
cp dist/spacy-$(sha).pex dist/spacy.pex
chmod a+rx dist/spacy.pex
dist/spacy-$(sha).pex : dist/$(wheel)
env3.6/bin/python -m pip install pex==1.5.3
env3.6/bin/pex pytest dist/$(wheel) -e spacy -o dist/spacy-$(sha).pex
dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py*
python3.6 -m venv env3.6 python3.6 -m venv env3.6
source env3.6/bin/activate source env3.6/bin/activate
env3.6/bin/pip install wheel env3.6/bin/pip install wheel
@ -9,10 +19,6 @@ dist/spacy.pex : spacy/*.py* spacy/*/*.py*
env3.6/bin/python setup.py build_ext --inplace env3.6/bin/python setup.py build_ext --inplace
env3.6/bin/python setup.py sdist env3.6/bin/python setup.py sdist
env3.6/bin/python setup.py bdist_wheel env3.6/bin/python setup.py bdist_wheel
env3.6/bin/python -m pip install pex==1.5.3
env3.6/bin/pex pytest dist/*.whl -e spacy -o dist/spacy-$(sha).pex
cp dist/spacy-$(sha).pex dist/spacy.pex
chmod a+rx dist/spacy.pex
.PHONY : clean .PHONY : clean

View File

@ -38,7 +38,7 @@ It's commercial open-source software, released under the MIT license.
| [Contribute] | How to contribute to the spaCy project and code base. | | [Contribute] | How to contribute to the spaCy project and code base. |
[spacy 101]: https://spacy.io/usage/spacy-101 [spacy 101]: https://spacy.io/usage/spacy-101
[new in v2.1]: https://spacy.io/usage/v2-1 [new in v2.2]: https://spacy.io/usage/v2-2
[usage guides]: https://spacy.io/usage/ [usage guides]: https://spacy.io/usage/
[api reference]: https://spacy.io/api/ [api reference]: https://spacy.io/api/
[models]: https://spacy.io/models [models]: https://spacy.io/models
@ -49,9 +49,12 @@ It's commercial open-source software, released under the MIT license.
## 💬 Where to ask questions ## 💬 Where to ask questions
The spaCy project is maintained by [@honnibal](https://github.com/honnibal) The spaCy project is maintained by [@honnibal](https://github.com/honnibal)
and [@ines](https://github.com/ines). Please understand that we won't be able and [@ines](https://github.com/ines), along with core contributors
to provide individual support via email. We also believe that help is much more [@svlandeg](https://github.com/svlandeg) and
valuable if it's shared publicly, so that more people can benefit from it. [@adrianeboyd](https://github.com/adrianeboyd). Please understand that we won't
be able to provide individual support via email. We also believe that help is
much more valuable if it's shared publicly, so that more people can benefit
from it.
| Type | Platforms | | Type | Platforms |
| ------------------------ | ------------------------------------------------------ | | ------------------------ | ------------------------------------------------------ |
@ -172,8 +175,8 @@ python -m spacy download en_core_web_sm
python -m spacy download en python -m spacy download en
# pip install .tar.gz archive from path or URL # pip install .tar.gz archive from path or URL
pip install /Users/you/en_core_web_sm-2.1.0.tar.gz pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
``` ```
### Loading and using models ### Loading and using models

12
bin/get-version.sh Executable file
View File

@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Print the package version declared in spacy/about.py.
#
# Looks for the `__version__ = ...` assignment and writes the bare version
# string (with surrounding quotes removed) to stdout. The path is relative,
# so the script has to be invoked from the repository root.
set -e

about_file="spacy/about.py"

# Stop after the first matching line so a second mention of `__version__ = `
# cannot turn the output into a multi-line value. Report a failed lookup
# explicitly instead of letting `set -e` exit silently.
if ! version=$(grep -m 1 "__version__ = " "$about_file"); then
    echo "get-version.sh: no '__version__ = ' line found in $about_file" >&2
    exit 1
fi

# Drop the assignment prefix, then every single and double quote.
version=${version/__version__ = /}
version=${version//\'/}
version=${version//\"/}

# Quote the expansion so the value is never word-split or glob-expanded.
printf '%s\n' "$version"

View File

@ -82,6 +82,8 @@ def read_data(
head = int(head) - 1 if head != "0" else id_ head = int(head) - 1 if head != "0" else id_
sent["words"].append(word) sent["words"].append(word)
sent["tags"].append(tag) sent["tags"].append(tag)
sent["morphology"].append(_parse_morph_string(morph))
sent["morphology"][-1].add("POS_%s" % pos)
sent["heads"].append(head) sent["heads"].append(head)
sent["deps"].append("ROOT" if dep == "root" else dep) sent["deps"].append("ROOT" if dep == "root" else dep)
sent["spaces"].append(space_after == "_") sent["spaces"].append(space_after == "_")
@ -90,10 +92,12 @@ def read_data(
if oracle_segments: if oracle_segments:
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"])) docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
golds.append(GoldParse(docs[-1], **sent)) golds.append(GoldParse(docs[-1], **sent))
assert golds[-1].morphology is not None
sent_annots.append(sent) sent_annots.append(sent)
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length: if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
doc, gold = _make_gold(nlp, None, sent_annots) doc, gold = _make_gold(nlp, None, sent_annots)
assert gold.morphology is not None
sent_annots = [] sent_annots = []
docs.append(doc) docs.append(doc)
golds.append(gold) golds.append(gold)
@ -108,6 +112,17 @@ def read_data(
return docs, golds return docs, golds
return docs, golds return docs, golds
def _parse_morph_string(morph_string):
if morph_string == '_':
return set()
output = []
replacements = {'1': 'one', '2': 'two', '3': 'three'}
for feature in morph_string.split('|'):
key, value = feature.split('=')
value = replacements.get(value, value)
value = value.split(',')[0]
output.append('%s_%s' % (key, value.lower()))
return set(output)
def read_conllu(file_): def read_conllu(file_):
docs = [] docs = []
@ -141,8 +156,8 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
flat = defaultdict(list) flat = defaultdict(list)
sent_starts = [] sent_starts = []
for sent in sent_annots: for sent in sent_annots:
flat["heads"].extend(len(flat["words"]) + head for head in sent["heads"]) flat["heads"].extend(len(flat["words"])+head for head in sent["heads"])
for field in ["words", "tags", "deps", "entities", "spaces"]: for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
flat[field].extend(sent[field]) flat[field].extend(sent[field])
sent_starts.append(True) sent_starts.append(True)
sent_starts.extend([False] * (len(sent["words"]) - 1)) sent_starts.extend([False] * (len(sent["words"]) - 1))
@ -216,9 +231,14 @@ def write_conllu(docs, file_):
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
matches = merger(doc) matches = merger(doc)
spans = [doc[start : end + 1] for _, start, end in matches] spans = [doc[start : end + 1] for _, start, end in matches]
seen_tokens = set()
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
for span in spans: for span in spans:
span_tokens = set(range(span.start, span.end))
if not span_tokens.intersection(seen_tokens):
retokenizer.merge(span) retokenizer.merge(span)
seen_tokens.update(span_tokens)
file_.write("# newdoc id = {i}\n".format(i=i)) file_.write("# newdoc id = {i}\n".format(i=i))
for j, sent in enumerate(doc.sents): for j, sent in enumerate(doc.sents):
file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j)) file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
@ -241,27 +261,29 @@ def write_conllu(docs, file_):
def print_progress(itn, losses, ud_scores): def print_progress(itn, losses, ud_scores):
fields = { fields = {
"dep_loss": losses.get("parser", 0.0), "dep_loss": losses.get("parser", 0.0),
"morph_loss": losses.get("morphologizer", 0.0),
"tag_loss": losses.get("tagger", 0.0), "tag_loss": losses.get("tagger", 0.0),
"words": ud_scores["Words"].f1 * 100, "words": ud_scores["Words"].f1 * 100,
"sents": ud_scores["Sentences"].f1 * 100, "sents": ud_scores["Sentences"].f1 * 100,
"tags": ud_scores["XPOS"].f1 * 100, "tags": ud_scores["XPOS"].f1 * 100,
"uas": ud_scores["UAS"].f1 * 100, "uas": ud_scores["UAS"].f1 * 100,
"las": ud_scores["LAS"].f1 * 100, "las": ud_scores["LAS"].f1 * 100,
"morph": ud_scores["Feats"].f1 * 100,
} }
header = ["Epoch", "Loss", "LAS", "UAS", "TAG", "SENT", "WORD"] header = ["Epoch", "P.Loss", "M.Loss", "LAS", "UAS", "TAG", "MORPH", "SENT", "WORD"]
if itn == 0: if itn == 0:
print("\t".join(header)) print("\t".join(header))
tpl = "\t".join( tpl = "\t".join((
(
"{:d}", "{:d}",
"{dep_loss:.1f}", "{dep_loss:.1f}",
"{morph_loss:.1f}",
"{las:.1f}", "{las:.1f}",
"{uas:.1f}", "{uas:.1f}",
"{tags:.1f}", "{tags:.1f}",
"{morph:.1f}",
"{sents:.1f}", "{sents:.1f}",
"{words:.1f}", "{words:.1f}",
) ))
)
print(tpl.format(itn, **fields)) print(tpl.format(itn, **fields))
@ -282,22 +304,23 @@ def get_token_conllu(token, i):
head = 0 head = 0
else: else:
head = i + (token.head.i - token.i) + 1 head = i + (token.head.i - token.i) + 1
fields = [ features = list(token.morph)
str(i + 1), feat_str = []
token.text, replacements = {"one": "1", "two": "2", "three": "3"}
token.lemma_, for feat in features:
token.pos_, if not feat.startswith("begin") and not feat.startswith("end"):
token.tag_, key, value = feat.split("_", 1)
"_", value = replacements.get(value, value)
str(head), feat_str.append("%s=%s" % (key, value.title()))
token.dep_.lower(), if not feat_str:
"_", feat_str = "_"
"_", else:
] feat_str = "|".join(feat_str)
fields = [str(i+1), token.text, token.lemma_, token.pos_, token.tag_, feat_str,
str(head), token.dep_.lower(), "_", "_"]
lines.append("\t".join(fields)) lines.append("\t".join(fields))
return "\n".join(lines) return "\n".join(lines)
Token.set_extension("get_conllu_lines", method=get_token_conllu) Token.set_extension("get_conllu_lines", method=get_token_conllu)
Token.set_extension("begins_fused", default=False) Token.set_extension("begins_fused", default=False)
Token.set_extension("inside_fused", default=False) Token.set_extension("inside_fused", default=False)
@ -324,7 +347,8 @@ def load_nlp(corpus, config, vectors=None):
def initialize_pipeline(nlp, docs, golds, config, device): def initialize_pipeline(nlp, docs, golds, config, device):
nlp.add_pipe(nlp.create_pipe("tagger")) nlp.add_pipe(nlp.create_pipe("tagger", config={"set_morphology": False}))
nlp.add_pipe(nlp.create_pipe("morphologizer"))
nlp.add_pipe(nlp.create_pipe("parser")) nlp.add_pipe(nlp.create_pipe("parser"))
if config.multitask_tag: if config.multitask_tag:
nlp.parser.add_multitask_objective("tag") nlp.parser.add_multitask_objective("tag")
@ -524,13 +548,11 @@ def main(
out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i) out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
with nlp.use_params(optimizer.averages): with nlp.use_params(optimizer.averages):
if use_oracle_segments: if use_oracle_segments:
parsed_docs, scores = evaluate( parsed_docs, scores = evaluate(nlp, paths.dev.conllu,
nlp, paths.dev.conllu, paths.dev.conllu, out_path paths.dev.conllu, out_path)
)
else: else:
parsed_docs, scores = evaluate( parsed_docs, scores = evaluate(nlp, paths.dev.text,
nlp, paths.dev.text, paths.dev.conllu, out_path paths.dev.conllu, out_path)
)
print_progress(i, losses, scores) print_progress(i, losses, scores)

View File

@ -8,8 +8,8 @@ For more details, see the documentation:
* Knowledge base: https://spacy.io/api/kb * Knowledge base: https://spacy.io/api/kb
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
Compatible with: spaCy vX.X Compatible with: spaCy v2.2
Last tested with: vX.X Last tested with: v2.2
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function
@ -73,7 +73,6 @@ def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
input_dim=INPUT_DIM, input_dim=INPUT_DIM,
desc_width=DESC_WIDTH, desc_width=DESC_WIDTH,
epochs=n_iter, epochs=n_iter,
threshold=0.001,
) )
encoder.train(description_list=descriptions, to_print=True) encoder.train(description_list=descriptions, to_print=True)

View File

@ -0,0 +1,121 @@
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.

View File

@ -0,0 +1,359 @@
Creative Commons Legal Code
Attribution-ShareAlike 3.0 Unported
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR
DAMAGES RESULTING FROM ITS USE.
License
THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE
COMMONS PUBLIC LICENSE ("CCPL" OR "LICENSE"). THE WORK IS PROTECTED BY
COPYRIGHT AND/OR OTHER APPLICABLE LAW. ANY USE OF THE WORK OTHER THAN AS
AUTHORIZED UNDER THIS LICENSE OR COPYRIGHT LAW IS PROHIBITED.
BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE
TO BE BOUND BY THE TERMS OF THIS LICENSE. TO THE EXTENT THIS LICENSE MAY
BE CONSIDERED TO BE A CONTRACT, THE LICENSOR GRANTS YOU THE RIGHTS
CONTAINED HERE IN CONSIDERATION OF YOUR ACCEPTANCE OF SUCH TERMS AND
CONDITIONS.
1. Definitions
a. "Adaptation" means a work based upon the Work, or upon the Work and
other pre-existing works, such as a translation, adaptation,
derivative work, arrangement of music or other alterations of a
literary or artistic work, or phonogram or performance and includes
cinematographic adaptations or any other form in which the Work may be
recast, transformed, or adapted including in any form recognizably
derived from the original, except that a work that constitutes a
Collection will not be considered an Adaptation for the purpose of
this License. For the avoidance of doubt, where the Work is a musical
work, performance or phonogram, the synchronization of the Work in
timed-relation with a moving image ("synching") will be considered an
Adaptation for the purpose of this License.
b. "Collection" means a collection of literary or artistic works, such as
encyclopedias and anthologies, or performances, phonograms or
broadcasts, or other works or subject matter other than works listed
in Section 1(f) below, which, by reason of the selection and
arrangement of their contents, constitute intellectual creations, in
which the Work is included in its entirety in unmodified form along
with one or more other contributions, each constituting separate and
independent works in themselves, which together are assembled into a
collective whole. A work that constitutes a Collection will not be
considered an Adaptation (as defined below) for the purposes of this
License.
c. "Creative Commons Compatible License" means a license that is listed
at https://creativecommons.org/compatiblelicenses that has been
approved by Creative Commons as being essentially equivalent to this
License, including, at a minimum, because that license: (i) contains
terms that have the same purpose, meaning and effect as the License
Elements of this License; and, (ii) explicitly permits the relicensing
of adaptations of works made available under that license under this
License or a Creative Commons jurisdiction license with the same
License Elements as this License.
d. "Distribute" means to make available to the public the original and
copies of the Work or Adaptation, as appropriate, through sale or
other transfer of ownership.
e. "License Elements" means the following high-level license attributes
as selected by Licensor and indicated in the title of this License:
Attribution, ShareAlike.
f. "Licensor" means the individual, individuals, entity or entities that
offer(s) the Work under the terms of this License.
g. "Original Author" means, in the case of a literary or artistic work,
the individual, individuals, entity or entities who created the Work
or if no individual or entity can be identified, the publisher; and in
addition (i) in the case of a performance the actors, singers,
musicians, dancers, and other persons who act, sing, deliver, declaim,
play in, interpret or otherwise perform literary or artistic works or
expressions of folklore; (ii) in the case of a phonogram the producer
being the person or legal entity who first fixes the sounds of a
performance or other sounds; and, (iii) in the case of broadcasts, the
organization that transmits the broadcast.
h. "Work" means the literary and/or artistic work offered under the terms
of this License including without limitation any production in the
literary, scientific and artistic domain, whatever may be the mode or
form of its expression including digital form, such as a book,
pamphlet and other writing; a lecture, address, sermon or other work
of the same nature; a dramatic or dramatico-musical work; a
choreographic work or entertainment in dumb show; a musical
composition with or without words; a cinematographic work to which are
assimilated works expressed by a process analogous to cinematography;
a work of drawing, painting, architecture, sculpture, engraving or
lithography; a photographic work to which are assimilated works
expressed by a process analogous to photography; a work of applied
art; an illustration, map, plan, sketch or three-dimensional work
relative to geography, topography, architecture or science; a
performance; a broadcast; a phonogram; a compilation of data to the
extent it is protected as a copyrightable work; or a work performed by
a variety or circus performer to the extent it is not otherwise
considered a literary or artistic work.
i. "You" means an individual or entity exercising rights under this
License who has not previously violated the terms of this License with
respect to the Work, or who has received express permission from the
Licensor to exercise rights under this License despite a previous
violation.
j. "Publicly Perform" means to perform public recitations of the Work and
to communicate to the public those public recitations, by any means or
process, including by wire or wireless means or public digital
performances; to make available to the public Works in such a way that
members of the public may access these Works from a place and at a
place individually chosen by them; to perform the Work to the public
by any means or process and the communication to the public of the
performances of the Work, including by public digital performance; to
broadcast and rebroadcast the Work by any means including signs,
sounds or images.
k. "Reproduce" means to make copies of the Work by any means including
without limitation by sound or visual recordings and the right of
fixation and reproducing fixations of the Work, including storage of a
protected performance or phonogram in digital form or other electronic
medium.
2. Fair Dealing Rights. Nothing in this License is intended to reduce,
limit, or restrict any uses free from copyright or rights arising from
limitations or exceptions that are provided for in connection with the
copyright protection under copyright law or other applicable laws.
3. License Grant. Subject to the terms and conditions of this License,
Licensor hereby grants You a worldwide, royalty-free, non-exclusive,
perpetual (for the duration of the applicable copyright) license to
exercise the rights in the Work as stated below:
a. to Reproduce the Work, to incorporate the Work into one or more
Collections, and to Reproduce the Work as incorporated in the
Collections;
b. to create and Reproduce Adaptations provided that any such Adaptation,
including any translation in any medium, takes reasonable steps to
clearly label, demarcate or otherwise identify that changes were made
to the original Work. For example, a translation could be marked "The
original work was translated from English to Spanish," or a
modification could indicate "The original work has been modified.";
c. to Distribute and Publicly Perform the Work including as incorporated
in Collections; and,
d. to Distribute and Publicly Perform Adaptations.
e. For the avoidance of doubt:
i. Non-waivable Compulsory License Schemes. In those jurisdictions in
which the right to collect royalties through any statutory or
compulsory licensing scheme cannot be waived, the Licensor
reserves the exclusive right to collect such royalties for any
exercise by You of the rights granted under this License;
ii. Waivable Compulsory License Schemes. In those jurisdictions in
which the right to collect royalties through any statutory or
compulsory licensing scheme can be waived, the Licensor waives the
exclusive right to collect such royalties for any exercise by You
of the rights granted under this License; and,
iii. Voluntary License Schemes. The Licensor waives the right to
collect royalties, whether individually or, in the event that the
Licensor is a member of a collecting society that administers
voluntary licensing schemes, via that society, from any exercise
by You of the rights granted under this License.
The above rights may be exercised in all media and formats whether now
known or hereafter devised. The above rights include the right to make
such modifications as are technically necessary to exercise the rights in
other media and formats. Subject to Section 8(f), all rights not expressly
granted by Licensor are hereby reserved.
4. Restrictions. The license granted in Section 3 above is expressly made
subject to and limited by the following restrictions:
a. You may Distribute or Publicly Perform the Work only under the terms
of this License. You must include a copy of, or the Uniform Resource
Identifier (URI) for, this License with every copy of the Work You
Distribute or Publicly Perform. You may not offer or impose any terms
on the Work that restrict the terms of this License or the ability of
the recipient of the Work to exercise the rights granted to that
recipient under the terms of the License. You may not sublicense the
Work. You must keep intact all notices that refer to this License and
to the disclaimer of warranties with every copy of the Work You
Distribute or Publicly Perform. When You Distribute or Publicly
Perform the Work, You may not impose any effective technological
measures on the Work that restrict the ability of a recipient of the
Work from You to exercise the rights granted to that recipient under
the terms of the License. This Section 4(a) applies to the Work as
incorporated in a Collection, but this does not require the Collection
apart from the Work itself to be made subject to the terms of this
License. If You create a Collection, upon notice from any Licensor You
must, to the extent practicable, remove from the Collection any credit
as required by Section 4(c), as requested. If You create an
Adaptation, upon notice from any Licensor You must, to the extent
practicable, remove from the Adaptation any credit as required by
Section 4(c), as requested.
b. You may Distribute or Publicly Perform an Adaptation only under the
terms of: (i) this License; (ii) a later version of this License with
the same License Elements as this License; (iii) a Creative Commons
jurisdiction license (either this or a later license version) that
contains the same License Elements as this License (e.g.,
Attribution-ShareAlike 3.0 US)); (iv) a Creative Commons Compatible
License. If you license the Adaptation under one of the licenses
mentioned in (iv), you must comply with the terms of that license. If
you license the Adaptation under the terms of any of the licenses
mentioned in (i), (ii) or (iii) (the "Applicable License"), you must
comply with the terms of the Applicable License generally and the
following provisions: (I) You must include a copy of, or the URI for,
the Applicable License with every copy of each Adaptation You
Distribute or Publicly Perform; (II) You may not offer or impose any
terms on the Adaptation that restrict the terms of the Applicable
License or the ability of the recipient of the Adaptation to exercise
the rights granted to that recipient under the terms of the Applicable
License; (III) You must keep intact all notices that refer to the
Applicable License and to the disclaimer of warranties with every copy
of the Work as included in the Adaptation You Distribute or Publicly
Perform; (IV) when You Distribute or Publicly Perform the Adaptation,
You may not impose any effective technological measures on the
Adaptation that restrict the ability of a recipient of the Adaptation
from You to exercise the rights granted to that recipient under the
terms of the Applicable License. This Section 4(b) applies to the
Adaptation as incorporated in a Collection, but this does not require
the Collection apart from the Adaptation itself to be made subject to
the terms of the Applicable License.
c. If You Distribute, or Publicly Perform the Work or any Adaptations or
Collections, You must, unless a request has been made pursuant to
Section 4(a), keep intact all copyright notices for the Work and
provide, reasonable to the medium or means You are utilizing: (i) the
name of the Original Author (or pseudonym, if applicable) if supplied,
and/or if the Original Author and/or Licensor designate another party
or parties (e.g., a sponsor institute, publishing entity, journal) for
attribution ("Attribution Parties") in Licensor's copyright notice,
terms of service or by other reasonable means, the name of such party
or parties; (ii) the title of the Work if supplied; (iii) to the
extent reasonably practicable, the URI, if any, that Licensor
specifies to be associated with the Work, unless such URI does not
refer to the copyright notice or licensing information for the Work;
and (iv) , consistent with Ssection 3(b), in the case of an
Adaptation, a credit identifying the use of the Work in the Adaptation
(e.g., "French translation of the Work by Original Author," or
"Screenplay based on original Work by Original Author"). The credit
required by this Section 4(c) may be implemented in any reasonable
manner; provided, however, that in the case of a Adaptation or
Collection, at a minimum such credit will appear, if a credit for all
contributing authors of the Adaptation or Collection appears, then as
part of these credits and in a manner at least as prominent as the
credits for the other contributing authors. For the avoidance of
doubt, You may only use the credit required by this Section for the
purpose of attribution in the manner set out above and, by exercising
Your rights under this License, You may not implicitly or explicitly
assert or imply any connection with, sponsorship or endorsement by the
Original Author, Licensor and/or Attribution Parties, as appropriate,
of You or Your use of the Work, without the separate, express prior
written permission of the Original Author, Licensor and/or Attribution
Parties.
d. Except as otherwise agreed in writing by the Licensor or as may be
otherwise permitted by applicable law, if You Reproduce, Distribute or
Publicly Perform the Work either by itself or as part of any
Adaptations or Collections, You must not distort, mutilate, modify or
take other derogatory action in relation to the Work which would be
prejudicial to the Original Author's honor or reputation. Licensor
agrees that in those jurisdictions (e.g. Japan), in which any exercise
of the right granted in Section 3(b) of this License (the right to
make Adaptations) would be deemed to be a distortion, mutilation,
modification or other derogatory action prejudicial to the Original
Author's honor and reputation, the Licensor will waive or not assert,
as appropriate, this Section, to the fullest extent permitted by the
applicable national law, to enable You to reasonably exercise Your
right under Section 3(b) of this License (right to make Adaptations)
but not otherwise.
5. Representations, Warranties and Disclaimer
UNLESS OTHERWISE MUTUALLY AGREED TO BY THE PARTIES IN WRITING, LICENSOR
OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY
KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE,
INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTIBILITY,
FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF
LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ABSENCE OF ERRORS,
WHETHER OR NOT DISCOVERABLE. SOME JURISDICTIONS DO NOT ALLOW THE EXCLUSION
OF IMPLIED WARRANTIES, SO SUCH EXCLUSION MAY NOT APPLY TO YOU.
6. Limitation on Liability. EXCEPT TO THE EXTENT REQUIRED BY APPLICABLE
LAW, IN NO EVENT WILL LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR
ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES
ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF LICENSOR HAS
BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
7. Termination
a. This License and the rights granted hereunder will terminate
automatically upon any breach by You of the terms of this License.
Individuals or entities who have received Adaptations or Collections
from You under this License, however, will not have their licenses
terminated provided such individuals or entities remain in full
compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will
survive any termination of this License.
b. Subject to the above terms and conditions, the license granted here is
perpetual (for the duration of the applicable copyright in the Work).
Notwithstanding the above, Licensor reserves the right to release the
Work under different license terms or to stop distributing the Work at
any time; provided, however that any such election will not serve to
withdraw this License (or any other license that has been, or is
required to be, granted under the terms of this License), and this
License will continue in full force and effect unless terminated as
stated above.
8. Miscellaneous
a. Each time You Distribute or Publicly Perform the Work or a Collection,
the Licensor offers to the recipient a license to the Work on the same
terms and conditions as the license granted to You under this License.
b. Each time You Distribute or Publicly Perform an Adaptation, Licensor
offers to the recipient a license to the original Work on the same
terms and conditions as the license granted to You under this License.
c. If any provision of this License is invalid or unenforceable under
applicable law, it shall not affect the validity or enforceability of
the remainder of the terms of this License, and without further action
by the parties to this agreement, such provision shall be reformed to
the minimum extent necessary to make such provision valid and
enforceable.
d. No term or provision of this License shall be deemed waived and no
breach consented to unless such waiver or consent shall be in writing
and signed by the party to be charged with such waiver or consent.
e. This License constitutes the entire agreement between the parties with
respect to the Work licensed here. There are no understandings,
agreements or representations with respect to the Work not specified
here. Licensor shall not be bound by any additional provisions that
may appear in any communication from You. This License may not be
modified without the mutual written agreement of the Licensor and You.
f. The rights granted under, and the subject matter referenced, in this
License were drafted utilizing the terminology of the Berne Convention
for the Protection of Literary and Artistic Works (as amended on
September 28, 1979), the Rome Convention of 1961, the WIPO Copyright
Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996
and the Universal Copyright Convention (as revised on July 24, 1971).
These rights and subject matter take effect in the relevant
jurisdiction in which the License terms are sought to be enforced
according to the corresponding provisions of the implementation of
those treaty provisions in the applicable national law. If the
standard suite of rights granted under applicable copyright law
includes additional rights not granted under this License, such
additional rights are deemed to be included in the License; this
License is not intended to restrict the license of any rights under
applicable law.
Creative Commons Notice
Creative Commons is not a party to this License, and makes no warranty
whatsoever in connection with the Work. Creative Commons will not be
liable to You or any party on any legal theory for any damages
whatsoever, including without limitation any general, special,
incidental or consequential damages arising in connection to this
license. Notwithstanding the foregoing two (2) sentences, if Creative
Commons has expressly identified itself as the Licensor hereunder, it
shall have all rights and obligations of Licensor.
Except for the limited purpose of indicating to the public that the
Work is licensed under the CCPL, Creative Commons does not authorize
the use by either party of the trademark "Creative Commons" or any
related trademark or logo of Creative Commons without the prior
written consent of Creative Commons. Any permitted use will be in
compliance with Creative Commons' then-current trademark usage
guidelines, as may be published on its website or otherwise made
available upon request from time to time. For the avoidance of doubt,
this trademark restriction does not form part of the License.
Creative Commons may be contacted at https://creativecommons.org/.

View File

@ -0,0 +1,428 @@
Attribution-ShareAlike 4.0 International
=======================================================================
Creative Commons Corporation ("Creative Commons") is not a law firm and
does not provide legal services or legal advice. Distribution of
Creative Commons public licenses does not create a lawyer-client or
other relationship. Creative Commons makes its licenses and related
information available on an "as-is" basis. Creative Commons gives no
warranties regarding its licenses, any material licensed under their
terms and conditions, or any related information. Creative Commons
disclaims all liability for damages resulting from their use to the
fullest extent possible.
Using Creative Commons Public Licenses
Creative Commons public licenses provide a standard set of terms and
conditions that creators and other rights holders may use to share
original works of authorship and other material subject to copyright
and certain other rights specified in the public license below. The
following considerations are for informational purposes only, are not
exhaustive, and do not form part of our licenses.
Considerations for licensors: Our public licenses are
intended for use by those authorized to give the public
permission to use material in ways otherwise restricted by
copyright and certain other rights. Our licenses are
irrevocable. Licensors should read and understand the terms
and conditions of the license they choose before applying it.
Licensors should also secure all rights necessary before
applying our licenses so that the public can reuse the
material as expected. Licensors should clearly mark any
material not subject to the license. This includes other CC-
licensed material, or material used under an exception or
limitation to copyright. More considerations for licensors:
wiki.creativecommons.org/Considerations_for_licensors
Considerations for the public: By using one of our public
licenses, a licensor grants the public permission to use the
licensed material under specified terms and conditions. If
the licensor's permission is not necessary for any reason--for
example, because of any applicable exception or limitation to
copyright--then that use is not regulated by the license. Our
licenses grant only permissions under copyright and certain
other rights that a licensor has authority to grant. Use of
the licensed material may still be restricted for other
reasons, including because others have copyright or other
rights in the material. A licensor may make special requests,
such as asking that all changes be marked or described.
Although not required by our licenses, you are encouraged to
respect those requests where reasonable. More considerations
for the public:
wiki.creativecommons.org/Considerations_for_licensees
=======================================================================
Creative Commons Attribution-ShareAlike 4.0 International Public
License
By exercising the Licensed Rights (defined below), You accept and agree
to be bound by the terms and conditions of this Creative Commons
Attribution-ShareAlike 4.0 International Public License ("Public
License"). To the extent this Public License may be interpreted as a
contract, You are granted the Licensed Rights in consideration of Your
acceptance of these terms and conditions, and the Licensor grants You
such rights in consideration of benefits the Licensor receives from
making the Licensed Material available under these terms and
conditions.
Section 1 -- Definitions.
a. Adapted Material means material subject to Copyright and Similar
Rights that is derived from or based upon the Licensed Material
and in which the Licensed Material is translated, altered,
arranged, transformed, or otherwise modified in a manner requiring
permission under the Copyright and Similar Rights held by the
Licensor. For purposes of this Public License, where the Licensed
Material is a musical work, performance, or sound recording,
Adapted Material is always produced where the Licensed Material is
synched in timed relation with a moving image.
b. Adapter's License means the license You apply to Your Copyright
and Similar Rights in Your contributions to Adapted Material in
accordance with the terms and conditions of this Public License.
c. BY-SA Compatible License means a license listed at
creativecommons.org/compatiblelicenses, approved by Creative
Commons as essentially the equivalent of this Public License.
d. Copyright and Similar Rights means copyright and/or similar rights
closely related to copyright including, without limitation,
performance, broadcast, sound recording, and Sui Generis Database
Rights, without regard to how the rights are labeled or
categorized. For purposes of this Public License, the rights
specified in Section 2(b)(1)-(2) are not Copyright and Similar
Rights.
e. Effective Technological Measures means those measures that, in the
absence of proper authority, may not be circumvented under laws
fulfilling obligations under Article 11 of the WIPO Copyright
Treaty adopted on December 20, 1996, and/or similar international
agreements.
f. Exceptions and Limitations means fair use, fair dealing, and/or
any other exception or limitation to Copyright and Similar Rights
that applies to Your use of the Licensed Material.
g. License Elements means the license attributes listed in the name
of a Creative Commons Public License. The License Elements of this
Public License are Attribution and ShareAlike.
h. Licensed Material means the artistic or literary work, database,
or other material to which the Licensor applied this Public
License.
i. Licensed Rights means the rights granted to You subject to the
terms and conditions of this Public License, which are limited to
all Copyright and Similar Rights that apply to Your use of the
Licensed Material and that the Licensor has authority to license.
j. Licensor means the individual(s) or entity(ies) granting rights
under this Public License.
k. Share means to provide material to the public by any means or
process that requires permission under the Licensed Rights, such
as reproduction, public display, public performance, distribution,
dissemination, communication, or importation, and to make material
available to the public including in ways that members of the
public may access the material from a place and at a time
individually chosen by them.
l. Sui Generis Database Rights means rights other than copyright
resulting from Directive 96/9/EC of the European Parliament and of
the Council of 11 March 1996 on the legal protection of databases,
as amended and/or succeeded, as well as other essentially
equivalent rights anywhere in the world.
m. You means the individual or entity exercising the Licensed Rights
under this Public License. Your has a corresponding meaning.
Section 2 -- Scope.
a. License grant.
1. Subject to the terms and conditions of this Public License,
the Licensor hereby grants You a worldwide, royalty-free,
non-sublicensable, non-exclusive, irrevocable license to
exercise the Licensed Rights in the Licensed Material to:
a. reproduce and Share the Licensed Material, in whole or
in part; and
b. produce, reproduce, and Share Adapted Material.
2. Exceptions and Limitations. For the avoidance of doubt, where
Exceptions and Limitations apply to Your use, this Public
License does not apply, and You do not need to comply with
its terms and conditions.
3. Term. The term of this Public License is specified in Section
6(a).
4. Media and formats; technical modifications allowed. The
Licensor authorizes You to exercise the Licensed Rights in
all media and formats whether now known or hereafter created,
and to make technical modifications necessary to do so. The
Licensor waives and/or agrees not to assert any right or
authority to forbid You from making technical modifications
necessary to exercise the Licensed Rights, including
technical modifications necessary to circumvent Effective
Technological Measures. For purposes of this Public License,
simply making modifications authorized by this Section 2(a)
(4) never produces Adapted Material.
5. Downstream recipients.
a. Offer from the Licensor -- Licensed Material. Every
recipient of the Licensed Material automatically
receives an offer from the Licensor to exercise the
Licensed Rights under the terms and conditions of this
Public License.
b. Additional offer from the Licensor -- Adapted Material.
Every recipient of Adapted Material from You
automatically receives an offer from the Licensor to
exercise the Licensed Rights in the Adapted Material
under the conditions of the Adapter's License You apply.
c. No downstream restrictions. You may not offer or impose
any additional or different terms or conditions on, or
apply any Effective Technological Measures to, the
Licensed Material if doing so restricts exercise of the
Licensed Rights by any recipient of the Licensed
Material.
6. No endorsement. Nothing in this Public License constitutes or
may be construed as permission to assert or imply that You
are, or that Your use of the Licensed Material is, connected
with, or sponsored, endorsed, or granted official status by,
the Licensor or others designated to receive attribution as
provided in Section 3(a)(1)(A)(i).
b. Other rights.
1. Moral rights, such as the right of integrity, are not
licensed under this Public License, nor are publicity,
privacy, and/or other similar personality rights; however, to
the extent possible, the Licensor waives and/or agrees not to
assert any such rights held by the Licensor to the limited
extent necessary to allow You to exercise the Licensed
Rights, but not otherwise.
2. Patent and trademark rights are not licensed under this
Public License.
3. To the extent possible, the Licensor waives any right to
collect royalties from You for the exercise of the Licensed
Rights, whether directly or through a collecting society
under any voluntary or waivable statutory or compulsory
licensing scheme. In all other cases the Licensor expressly
reserves any right to collect such royalties.
Section 3 -- License Conditions.
Your exercise of the Licensed Rights is expressly made subject to the
following conditions.
a. Attribution.
1. If You Share the Licensed Material (including in modified
form), You must:
a. retain the following if it is supplied by the Licensor
with the Licensed Material:
i. identification of the creator(s) of the Licensed
Material and any others designated to receive
attribution, in any reasonable manner requested by
the Licensor (including by pseudonym if
designated);
ii. a copyright notice;
iii. a notice that refers to this Public License;
iv. a notice that refers to the disclaimer of
warranties;
v. a URI or hyperlink to the Licensed Material to the
extent reasonably practicable;
b. indicate if You modified the Licensed Material and
retain an indication of any previous modifications; and
c. indicate the Licensed Material is licensed under this
Public License, and include the text of, or the URI or
hyperlink to, this Public License.
2. You may satisfy the conditions in Section 3(a)(1) in any
reasonable manner based on the medium, means, and context in
which You Share the Licensed Material. For example, it may be
reasonable to satisfy the conditions by providing a URI or
hyperlink to a resource that includes the required
information.
3. If requested by the Licensor, You must remove any of the
information required by Section 3(a)(1)(A) to the extent
reasonably practicable.
b. ShareAlike.
In addition to the conditions in Section 3(a), if You Share
Adapted Material You produce, the following conditions also apply.
1. The Adapter's License You apply must be a Creative Commons
license with the same License Elements, this version or
later, or a BY-SA Compatible License.
2. You must include the text of, or the URI or hyperlink to, the
Adapter's License You apply. You may satisfy this condition
in any reasonable manner based on the medium, means, and
context in which You Share Adapted Material.
3. You may not offer or impose any additional or different terms
or conditions on, or apply any Effective Technological
Measures to, Adapted Material that restrict exercise of the
rights granted under the Adapter's License You apply.
Section 4 -- Sui Generis Database Rights.
Where the Licensed Rights include Sui Generis Database Rights that
apply to Your use of the Licensed Material:
a. for the avoidance of doubt, Section 2(a)(1) grants You the right
to extract, reuse, reproduce, and Share all or a substantial
portion of the contents of the database;
b. if You include all or a substantial portion of the database
contents in a database in which You have Sui Generis Database
Rights, then the database in which You have Sui Generis Database
Rights (but not its individual contents) is Adapted Material,
including for purposes of Section 3(b); and
c. You must comply with the conditions in Section 3(a) if You Share
all or a substantial portion of the contents of the database.
For the avoidance of doubt, this Section 4 supplements and does not
replace Your obligations under this Public License where the Licensed
Rights include other Copyright and Similar Rights.
Section 5 -- Disclaimer of Warranties and Limitation of Liability.
a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
c. The disclaimer of warranties and limitation of liability provided
above shall be interpreted in a manner that, to the extent
possible, most closely approximates an absolute disclaimer and
waiver of all liability.
Section 6 -- Term and Termination.
a. This Public License applies for the term of the Copyright and
Similar Rights licensed here. However, if You fail to comply with
this Public License, then Your rights under this Public License
terminate automatically.
b. Where Your right to use the Licensed Material has terminated under
Section 6(a), it reinstates:
1. automatically as of the date the violation is cured, provided
it is cured within 30 days of Your discovery of the
violation; or
2. upon express reinstatement by the Licensor.
For the avoidance of doubt, this Section 6(b) does not affect any
right the Licensor may have to seek remedies for Your violations
of this Public License.
c. For the avoidance of doubt, the Licensor may also offer the
Licensed Material under separate terms or conditions or stop
distributing the Licensed Material at any time; however, doing so
will not terminate this Public License.
d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
License.
Section 7 -- Other Terms and Conditions.
a. The Licensor shall not be bound by any additional or different
terms or conditions communicated by You unless expressly agreed.
b. Any arrangements, understandings, or agreements regarding the
Licensed Material not stated herein are separate from and
independent of the terms and conditions of this Public License.
Section 8 -- Interpretation.
a. For the avoidance of doubt, this Public License does not, and
shall not be interpreted to, reduce, limit, restrict, or impose
conditions on any use of the Licensed Material that could lawfully
be made without permission under this Public License.
b. To the extent possible, if any provision of this Public License is
deemed unenforceable, it shall be automatically reformed to the
minimum extent necessary to make it enforceable. If the provision
cannot be reformed, it shall be severed from this Public License
without affecting the enforceability of the remaining terms and
conditions.
c. No term or condition of this Public License will be waived and no
failure to comply consented to unless expressly agreed to by the
Licensor.
d. Nothing in this Public License constitutes or may be interpreted
as a limitation upon, or waiver of, any privileges and immunities
that apply to the Licensor or You, including from the legal
processes of any jurisdiction or authority.
=======================================================================
Creative Commons is not a party to its public
licenses. Notwithstanding, Creative Commons may elect to apply one of
its public licenses to material it publishes and in those instances
will be considered the “Licensor.” The text of the Creative Commons
public licenses is dedicated to the public domain under the CC0 Public
Domain Dedication. Except for the limited purpose of indicating that
material is shared under a Creative Commons public license or as
otherwise permitted by the Creative Commons policies published at
creativecommons.org/policies, Creative Commons does not authorize the
use of the trademark "Creative Commons" or any other trademark or logo
of Creative Commons without its prior written consent including,
without limitation, in connection with any unauthorized modifications
to any of its public licenses or any other arrangements,
understandings, or agreements concerning use of licensed material. For
the avoidance of doubt, this paragraph does not form part of the
public licenses.
Creative Commons may be contacted at creativecommons.org.

View File

@ -0,0 +1,34 @@
## Examples of textcat training data
The spaCy JSON training files were generated from the JSONL files with:
```
python textcatjsonl_to_trainjson.py -m en file.jsonl .
```
`cooking.json` is an example of mutually exclusive classes, with two labels:
* `baking`
* `not_baking`
`jigsaw-toxic-comment.json` is an example with multiple labels per instance:
* `insult`
* `obscene`
* `severe_toxic`
* `toxic`
### Data Sources
* `cooking.jsonl`: https://cooking.stackexchange.com. The meta IDs link to the
original question as `https://cooking.stackexchange.com/questions/ID`, e.g.,
`https://cooking.stackexchange.com/questions/2` for the first instance.
* `jigsaw-toxic-comment.jsonl`: [Jigsaw Toxic Comments Classification
Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)
### Data Licenses
* `cooking.jsonl`: CC BY-SA 4.0 ([`CC_BY-SA-4.0.txt`](CC_BY-SA-4.0.txt))
* `jigsaw-toxic-comment.jsonl`:
  * text: CC BY-SA 3.0 ([`CC_BY-SA-3.0.txt`](CC_BY-SA-3.0.txt))
  * annotation: CC0 ([`CC0.txt`](CC0.txt))

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,10 @@
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "2"}, "text": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven by laying the strips out on a cookie sheet. When using this method, how long should I cook the bacon for, and at what temperature?\n"}
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "3"}, "text": "What is the difference between white and brown eggs?\nI always use brown extra large eggs, but I can't honestly say why I do this other than habit at this point. Are there any distinct advantages or disadvantages like flavor, shelf life, etc?\n"}
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "4"}, "text": "What is the difference between baking soda and baking powder?\nAnd can I use one in place of the other in certain recipes?\n"}
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "5"}, "text": "In a tomato sauce recipe, how can I cut the acidity?\nIt seems that every time I make a tomato sauce for pasta, the sauce is a little bit too acid for my taste. I've tried using sugar or sodium bicarbonate, but I'm not satisfied with the results.\n"}
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "6"}, "text": "What ingredients (available in specific regions) can I substitute for parsley?\nI have a recipe that calls for fresh parsley. I have substituted other fresh herbs for their dried equivalents but I don't have fresh or dried parsley. Is there something else (ex another dried herb) that I can use instead of parsley?\nI know it is used mainly for looks rather than taste but I have a pasta recipe that calls for 2 tablespoons of parsley in the sauce and then another 2 tablespoons on top when it is done. I know the parsley on top is more for looks but there must be something about the taste otherwise it would call for parsley within the sauce as well.\nI would especially like to hear about substitutes available in Southeast Asia and other parts of the world where the obvious answers (such as cilantro) are not widely available.\n"}
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "9"}, "text": "What is the internal temperature a steak should be cooked to for Rare/Medium Rare/Medium/Well?\nI'd like to know when to take my steaks off the grill and please everybody.\n"}
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "11"}, "text": "How should I poach an egg?\nWhat's the best method to poach an egg without it turning into an eggy soupy mess?\n"}
{"cats": {"baking": 0.0, "not_baking": 1.0}, "meta": {"id": "12"}, "text": "How can I make my Ice Cream \"creamier\"\nMy ice cream doesn't feel creamy enough. I got the recipe from Good Eats, and I can't tell if it's just the recipe or maybe that I'm just not getting my \"batter\" cold enough before I try to make it (I let it chill overnight in the refrigerator, but it doesn't always come out of the machine looking like \"soft serve\" as he said on the show - it's usually a little thinner).\nRecipe: http://www.foodnetwork.com/recipes/alton-brown/serious-vanilla-ice-cream-recipe/index.html\nThanks!\n"}
{"cats": {"baking": 1.0, "not_baking": 0.0}, "meta": {"id": "17"}, "text": "How long and at what temperature do the various parts of a chicken need to be cooked?\nI'm interested in baking thighs, legs, breasts and wings. How long do each of these items need to bake and at what temperature?\n"}
{"cats": {"baking": 1.0, "not_baking": 0.0}, "meta": {"id": "27"}, "text": "Do I need to sift flour that is labeled sifted?\nIs there really an advantage to sifting flour that I bought that was labeled 'sifted'?\n"}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,10 @@
{"meta": {"id": "0000997932d777bf"}, "text": "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
{"meta": {"id": "001956c382006abd"}, "text": "I'm Sorry \n\nI'm sorry I screwed around with someones talk page. It was very bad to do. I know how having the templates on their talk page helps you assert your dominance over them. I know I should bow down to the almighty administrators. But then again, I'm going to go play outside....with your mom. 76.122.79.82", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 1}}
{"meta": {"id": "0020e7119b96eeeb"}, "text": "Stupid peace of shit stop deleting my stuff asshole go die and fall in a hole go to hell!", "cats": {"insult": 1, "obscene": 1, "severe_toxic": 1, "toxic": 1}}
{"meta": {"id": "000103f0d9cfb60f"}, "text": "D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
{"meta": {"id": "001dc38a83d420cf"}, "text": "GET FUCKED UP. GET FUCKEEED UP. GOT A DRINK THAT YOU CANT PUT DOWN???/ GET FUCK UP GET FUCKED UP. I'M FUCKED UP RIGHT NOW!", "cats": {"insult": 0, "obscene": 1, "severe_toxic": 0, "toxic": 1}}
{"meta": {"id": "000113f07ec002fd"}, "text": "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
{"meta": {"id": "0001b41b1c6bb37e"}, "text": "\"\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of \"\"types of accidents\"\" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It's listed in the relevant form eg Wikipedia:Good_article_nominations#Transport \"", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
{"meta": {"id": "0001d958c54c6e35"}, "text": "You, sir, are my hero. Any chance you remember what page that's on?", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
{"meta": {"id": "00025465d4725e87"}, "text": "\"\n\nCongratulations from me as well, use the tools well.  · talk \"", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 0}}
{"meta": {"id": "002264ea4d5f2887"}, "text": "Why can't you believe how fat Artie is? Did you see him on his recent appearence on the Tonight Show with Jay Leno? He looks absolutely AWFUL! If I had to put money on it, I'd say that Artie Lange is a can't miss candidate for the 2007 Dead pool! \n\n \nKindly keep your malicious fingers off of my above comment, . Everytime you remove it, I will repost it!!!", "cats": {"insult": 0, "obscene": 0, "severe_toxic": 0, "toxic": 1}}

View File

@ -0,0 +1,53 @@
from pathlib import Path
import plac
import spacy
from spacy.gold import docs_to_json
import srsly
import sys
@plac.annotations(
    model=("Model name. Defaults to 'en'.", "option", "m", str),
    input_file=("Input file (jsonl)", "positional", None, Path),
    output_dir=("Output directory", "positional", None, Path),
    n_texts=("Number of texts to convert", "option", "t", int),
)
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
    """Convert a JSONL file of text classification examples to a JSON file.

    Every non-empty input line must be a JSON object with a "text" and a
    "cats" entry. The texts are run through the loaded model with all of its
    own pipes disabled and a sentencizer added, the categories are attached
    to the resulting docs, and the docs are serialized with `docs_to_json`.
    The result is written to `<output_dir>/<input file name>.json`.

    model (str): Name of the model to load with `spacy.load`.
    input_file (Path): JSONL file to read.
    output_dir (Path): Directory to write to; created (including parents) if
        it does not exist. Defaults to the current directory.
    n_texts (int): Maximum number of texts to convert. Values of 0 or less
        convert every text in the file.

    Exits with status 1 if the input file is missing.
    """
    # Check the input first so a bad path fails before the model is loaded.
    if input_file is None or not Path(input_file).exists():
        print("Input file not found:", input_file)
        sys.exit(1)
    input_file = Path(input_file)

    texts = []
    cats = []
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with input_file.open("r", encoding="utf8") as fileh:
        for line in fileh:
            if not line.strip():
                # Tolerate blank lines, e.g. a trailing newline at the end.
                continue
            data = srsly.json_loads(line)
            texts.append(data["text"])
            cats.append(data["cats"])
            # Stop at exactly n_texts examples. The previous loop appended the
            # doc before testing the counter and so converted one too many.
            if n_texts > 0 and len(texts) >= n_texts:
                break

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir(parents=True)
    else:
        output_dir = Path(".")

    # Load model with tokenizer + sentencizer only
    nlp = spacy.load(model)
    nlp.disable_pipes(*nlp.pipe_names)
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer, first=True)

    docs = []
    for doc, doc_cats in zip(nlp.pipe(texts), cats):
        doc.cats = doc_cats
        docs.append(doc)

    # Use only the file name: joining the full input path would re-create the
    # input's directories under output_dir, and an absolute input path would
    # discard output_dir altogether.
    output_file = output_dir / input_file.with_suffix(".json").name
    srsly.write_json(output_file, [docs_to_json(docs)])
# When this file is executed as a script (rather than imported), hand
# `convert` to plac so it is driven by the annotations declared on it.
if __name__ == "__main__":
    plac.call(convert)

View File

@ -8,8 +8,8 @@ For more details, see the documentation:
* Training: https://spacy.io/usage/training * Training: https://spacy.io/usage/training
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
Compatible with: spaCy vX.X Compatible with: spaCy v2.2
Last tested with: vX.X Last tested with: v2.2
""" """
from __future__ import unicode_literals, print_function from __future__ import unicode_literals, print_function

View File

@ -1,8 +1,8 @@
# Our libraries # Our libraries
cymem>=2.0.2,<2.1.0 cymem>=2.0.2,<2.1.0
preshed>=2.0.1,<2.1.0 preshed>=3.0.0,<3.1.0
thinc>=7.0.8,<7.1.0 thinc>=7.1.1,<7.2.0
blis>=0.2.2,<0.3.0 blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0 murmurhash>=0.28.0,<1.1.0
wasabi>=0.2.0,<1.1.0 wasabi>=0.2.0,<1.1.0
srsly>=0.1.0,<1.1.0 srsly>=0.1.0,<1.1.0

View File

@ -43,6 +43,7 @@ MOD_NAMES = [
"spacy.kb", "spacy.kb",
"spacy.morphology", "spacy.morphology",
"spacy.pipeline.pipes", "spacy.pipeline.pipes",
"spacy.pipeline.morphologizer",
"spacy.syntax.stateclass", "spacy.syntax.stateclass",
"spacy.syntax._state", "spacy.syntax._state",
"spacy.tokenizer", "spacy.tokenizer",
@ -56,6 +57,7 @@ MOD_NAMES = [
"spacy.tokens.doc", "spacy.tokens.doc",
"spacy.tokens.span", "spacy.tokens.span",
"spacy.tokens.token", "spacy.tokens.token",
"spacy.tokens.morphanalysis",
"spacy.tokens._retokenize", "spacy.tokens._retokenize",
"spacy.matcher.matcher", "spacy.matcher.matcher",
"spacy.matcher.phrasematcher", "spacy.matcher.phrasematcher",
@ -245,9 +247,9 @@ def setup_package():
"numpy>=1.15.0", "numpy>=1.15.0",
"murmurhash>=0.28.0,<1.1.0", "murmurhash>=0.28.0,<1.1.0",
"cymem>=2.0.2,<2.1.0", "cymem>=2.0.2,<2.1.0",
"preshed>=2.0.1,<2.1.0", "preshed>=3.0.0,<3.1.0",
"thinc>=7.0.8,<7.1.0", "thinc>=7.1.1,<7.2.0",
"blis>=0.2.2,<0.3.0", "blis>=0.4.0,<0.5.0",
"plac<1.0.0,>=0.9.6", "plac<1.0.0,>=0.9.6",
"requests>=2.13.0,<3.0.0", "requests>=2.13.0,<3.0.0",
"wasabi>=0.2.0,<1.1.0", "wasabi>=0.2.0,<1.1.0",
@ -281,7 +283,6 @@ def setup_package():
"Programming Language :: Python :: 2", "Programming Language :: Python :: 2",
"Programming Language :: Python :: 2.7", "Programming Language :: Python :: 2.7",
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.4",
"Programming Language :: Python :: 3.5", "Programming Language :: Python :: 3.5",
"Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.7",

View File

@ -15,7 +15,7 @@ from thinc.api import uniqued, wrap, noop
from thinc.api import with_square_sequences from thinc.api import with_square_sequences
from thinc.linear.linear import LinearModel from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module from thinc.neural.util import get_array_module, copy_array
from thinc.neural.optimizers import Adam from thinc.neural.optimizers import Adam
from thinc import describe from thinc import describe
@ -286,10 +286,7 @@ def link_vectors_to_models(vocab):
if vectors.name is None: if vectors.name is None:
vectors.name = VECTORS_KEY vectors.name = VECTORS_KEY
if vectors.data.size != 0: if vectors.data.size != 0:
print( user_warning(Warnings.W020.format(shape=vectors.data.shape))
"Warning: Unnamed vectors -- this won't allow multiple vectors "
"models to be loaded. (Shape: (%d, %d))" % vectors.data.shape
)
ops = Model.ops ops = Model.ops
for word in vocab: for word in vocab:
if word.orth in vectors.key2row: if word.orth in vectors.key2row:
@ -323,6 +320,9 @@ def Tok2Vec(width, embed_size, **kwargs):
pretrained_vectors = kwargs.get("pretrained_vectors", None) pretrained_vectors = kwargs.get("pretrained_vectors", None)
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3) cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
subword_features = kwargs.get("subword_features", True) subword_features = kwargs.get("subword_features", True)
char_embed = kwargs.get("char_embed", False)
if char_embed:
subword_features = False
conv_depth = kwargs.get("conv_depth", 4) conv_depth = kwargs.get("conv_depth", 4)
bilstm_depth = kwargs.get("bilstm_depth", 0) bilstm_depth = kwargs.get("bilstm_depth", 0)
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH] cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
@ -362,6 +362,14 @@ def Tok2Vec(width, embed_size, **kwargs):
>> LN(Maxout(width, width * 4, pieces=3)), >> LN(Maxout(width, width * 4, pieces=3)),
column=cols.index(ORTH), column=cols.index(ORTH),
) )
elif char_embed:
embed = concatenate_lists(
CharacterEmbed(nM=64, nC=8),
FeatureExtracter(cols) >> with_flatten(norm),
)
reduce_dimensions = LN(
Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
)
else: else:
embed = norm embed = norm
@ -369,9 +377,15 @@ def Tok2Vec(width, embed_size, **kwargs):
ExtractWindow(nW=1) ExtractWindow(nW=1)
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
) )
if char_embed:
tok2vec = embed >> with_flatten(
reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
)
else:
tok2vec = FeatureExtracter(cols) >> with_flatten( tok2vec = FeatureExtracter(cols) >> with_flatten(
embed >> convolution ** conv_depth, pad=conv_depth embed >> convolution ** conv_depth, pad=conv_depth
) )
if bilstm_depth >= 1: if bilstm_depth >= 1:
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth) tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
# Work around thinc API limitations :(. TODO: Revise in Thinc 7 # Work around thinc API limitations :(. TODO: Revise in Thinc 7
@ -504,6 +518,46 @@ def getitem(i):
return layerize(getitem_fwd) return layerize(getitem_fwd)
@describe.attributes(
    W=Synapses("Weights matrix", lambda obj: (obj.nO, obj.nI), lambda W, ops: None)
)
class MultiSoftmax(Affine):
    """Affine layer whose output columns are split into several softmax groups.

    Each entry of `out_sizes` is the width of one group. A single affine
    transform produces `sum(out_sizes)` columns, and a softmax is then applied
    to every group of columns separately. With sizes 6 and 5, for example,
    11 columns are predicted: columns 0-5 form one probability distribution
    and columns 6-10 form another.
    """

    name = "multisoftmax"

    def __init__(self, out_sizes, nI=None, **kwargs):
        """Store the group sizes and derive the total output width.

        out_sizes (iterable of int): Width of each softmax group.
        nI (int): Input width, or None to leave it unset.
        """
        # Note that this calls Model.__init__ directly, not Affine.__init__.
        Model.__init__(self, **kwargs)
        self.out_sizes = out_sizes
        self.nO = sum(out_sizes)
        self.nI = nI

    def predict(self, input__BI):
        """Apply the affine transform, then softmax each column group.

        RETURNS: The output array, with every group normalized in place.
        """
        output__BO = self.ops.affine(self.W, self.b, input__BI)
        start = 0
        for size in self.out_sizes:
            stop = start + size
            # The slice is passed with inplace=True, so output__BO is updated.
            self.ops.softmax(output__BO[:, start:stop], inplace=True)
            start = stop
        return output__BO

    def begin_update(self, input__BI, drop=0.0):
        """Run the forward pass and return it with its backward callback.

        `drop` is accepted for interface compatibility and is not used.
        RETURNS (tuple): The output of `predict` and the backprop callback.
        """

        def finish_update(d_output__BO, sgd=None):
            # Accumulate the parameter gradients.
            self.d_W += self.ops.gemm(d_output__BO, input__BI, trans1=True)
            self.d_b += d_output__BO.sum(axis=0)
            # The input gradient is computed before the optimizer is invoked.
            d_input__BI = self.ops.gemm(d_output__BO, self.W)
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            return d_input__BI

        return self.predict(input__BI), finish_update
def build_tagger_model(nr_class, **cfg): def build_tagger_model(nr_class, **cfg):
embed_size = util.env_opt("embed_size", 2000) embed_size = util.env_opt("embed_size", 2000)
if "token_vector_width" in cfg: if "token_vector_width" in cfg:
@ -530,6 +584,33 @@ def build_tagger_model(nr_class, **cfg):
return model return model
def build_morphologizer_model(class_nums, **cfg):
    """Build the morphologizer model: a tok2vec layer chained into a
    MultiSoftmax output layer.

    class_nums: Sizes of the softmax groups, passed to MultiSoftmax.
    **cfg: Optional settings read here:
        token_vector_width: Width of the token vectors. If absent, taken from
            util.env_opt("token_vector_width", 128).
        pretrained_vectors: Forwarded to Tok2Vec (None if absent).
        char_embed: Forwarded to Tok2Vec (True if absent).
        tok2vec: An existing tok2vec layer to use instead of building one.
    RETURNS: The chained model, with `tok2vec` and `softmax` attached as
        attributes and `nI` set to None.
    """
    embed_size = util.env_opt("embed_size", 7000)
    # env_opt is only consulted when the width was not passed in explicitly.
    token_vector_width = (
        cfg["token_vector_width"]
        if "token_vector_width" in cfg
        else util.env_opt("token_vector_width", 128)
    )
    with Model.define_operators({">>": chain, "+": add, "**": clone}):
        tok2vec = (
            cfg["tok2vec"]
            if "tok2vec" in cfg
            else Tok2Vec(
                token_vector_width,
                embed_size,
                char_embed=cfg.get("char_embed", True),
                pretrained_vectors=cfg.get("pretrained_vectors"),
            )
        )
        softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
        # Expose the group sizes on the wrapping layer as well.
        softmax.out_sizes = class_nums
        model = tok2vec >> softmax
    model.nI = None
    model.tok2vec = tok2vec
    model.softmax = softmax
    return model
@layerize @layerize
def SpacyVectors(docs, drop=0.0): def SpacyVectors(docs, drop=0.0):
batch = [] batch = []
@ -720,6 +801,7 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
concat = concatenate(*layers) concat = concatenate(*layers)
def concatenate_lists_fwd(Xs, drop=0.0): def concatenate_lists_fwd(Xs, drop=0.0):
if drop is not None:
drop *= drop_factor drop *= drop_factor
lengths = ops.asarray([len(X) for X in Xs], dtype="i") lengths = ops.asarray([len(X) for X in Xs], dtype="i")
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop) flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
@ -810,6 +892,67 @@ def _replace_word(word, random_words, mask="[MASK]"):
return word return word
def _uniform_init(lo, hi):
    """Create a weight initializer that draws from a uniform distribution.

    lo, hi: Bounds passed to `ops.xp.random.uniform`.
    RETURNS (callable): A function taking `(W, ops)` that samples an array of
        shape `W.shape` and passes it to `copy_array` together with `W`.
    """

    def init_weights(W, ops):
        samples = ops.xp.random.uniform(lo, hi, W.shape)
        copy_array(W, samples)

    return init_weights
@describe.attributes(
    nM=Dimension("Vector dimensions"),
    nC=Dimension("Number of characters per word"),
    vectors=Synapses(
        "Embed matrix", lambda obj: (obj.nC, obj.nV, obj.nM), _uniform_init(-0.1, 0.1)
    ),
    d_vectors=Gradient("vectors"),
)
class CharacterEmbed(Model):
    """Embed each token as the concatenation of per-character vectors.

    For every token, `nC` character ids are looked up in a table of shape
    (nC, nV, nM), i.e. one embedding table per character position, and the
    resulting `nC` vectors of width `nM` are flattened into one row of width
    `nO = nM * nC`.
    """

    def __init__(self, nM=None, nC=None, **kwargs):
        """
        nM (int): Width of each character vector.
        nC (int): Number of characters used per word.
        """
        Model.__init__(self, **kwargs)
        self.nM = nM
        self.nC = nC

    @property
    def nO(self):
        """Output width per token: one nM-wide vector for each of nC slots."""
        return self.nM * self.nC

    @property
    def nV(self):
        """Number of rows per character position in the embedding table."""
        # presumably one row per possible byte value, given the ids come from
        # `doc.to_utf8_array` -- TODO confirm
        return 256

    def begin_update(self, docs, drop=0.0):
        """Embed a batch of docs.

        docs (list): Docs providing `to_utf8_array(nr_char=...)` and `len()`.
        drop (float): Accepted for interface compatibility; not used.
        RETURNS (tuple): A list with one (len(doc), nO) array per doc, and the
            backprop callback. An empty batch yields an empty list and a
            callback that does nothing.
        """
        if not docs:
            # Keep the (output, callback) shape for an empty batch. Returning a
            # bare list here broke every caller that unpacks the result.
            return [], lambda d_vectors, sgd=None: None
        ids = []
        output = []
        weights = self.vectors
        # This assists in indexing; it's like looping over this dimension.
        # Thanks to Mark Neumann for the tip.
        nCv = self.ops.xp.arange(self.nC)
        for doc in docs:
            doc_ids = doc.to_utf8_array(nr_char=self.nC)
            doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
            # Given a 2d array of indices and a 3d table of data, this is the
            # indexing that produces
            # output[i, j, k] == data[j, ids[i, j], k]
            doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
            output.append(doc_vectors.reshape((len(doc), self.nO)))
            ids.append(doc_ids)

        def backprop_character_embed(d_vectors, sgd=None):
            gradient = self.d_vectors
            for doc_ids, d_doc_vectors in zip(ids, d_vectors):
                d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), self.nC, self.nM))
                # NOTE(review): with NumPy, an indexed `+=` applies a single
                # update per repeated index pair. If contributions from tokens
                # sharing the same character at the same position are meant to
                # accumulate, this needs a scatter-add -- confirm intent.
                gradient[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            # The inputs are docs, so there is no input gradient to return.
            return None

        return output, backprop_character_embed
def get_cossim_loss(yh, y): def get_cossim_loss(yh, y):
# Add a small constant to avoid 0 vectors # Add a small constant to avoid 0 vectors
yh = yh + 1e-8 yh = yh + 1e-8

View File

@ -1,16 +1,12 @@
# inspired from:
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "2.1.8" __version__ = "2.2.0.dev8"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __summary__ = "Industrial-strength Natural Language Processing (NLP) in Python"
__uri__ = "https://spacy.io" __uri__ = "https://spacy.io"
__author__ = "Explosion AI" __author__ = "Explosion"
__email__ = "contact@explosion.ai" __email__ = "contact@explosion.ai"
__license__ = "MIT" __license__ = "MIT"
__release__ = True __release__ = False
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -144,8 +144,12 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False):
for name, value in stringy_attrs.items(): for name, value in stringy_attrs.items():
if isinstance(name, int): if isinstance(name, int):
int_key = name int_key = name
else: elif name in IDS:
int_key = IDS[name]
elif name.upper() in IDS:
int_key = IDS[name.upper()] int_key = IDS[name.upper()]
else:
continue
if strings_map is not None and isinstance(value, basestring): if strings_map is not None and isinstance(value, basestring):
if hasattr(strings_map, 'add'): if hasattr(strings_map, 'add'):
value = strings_map.add(value) value = strings_map.add(value)

View File

@ -34,12 +34,6 @@ BLANK_MODEL_THRESHOLD = 2000
str, str,
), ),
ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool), ignore_warnings=("Ignore warnings, only show stats and errors", "flag", "IW", bool),
ignore_validation=(
"Don't exit if JSON format validation fails",
"flag",
"IV",
bool,
),
verbose=("Print additional information and explanations", "flag", "V", bool), verbose=("Print additional information and explanations", "flag", "V", bool),
no_format=("Don't pretty-print the results", "flag", "NF", bool), no_format=("Don't pretty-print the results", "flag", "NF", bool),
) )
@ -50,10 +44,14 @@ def debug_data(
base_model=None, base_model=None,
pipeline="tagger,parser,ner", pipeline="tagger,parser,ner",
ignore_warnings=False, ignore_warnings=False,
ignore_validation=False,
verbose=False, verbose=False,
no_format=False, no_format=False,
): ):
"""
Analyze, debug and validate your training and development data, get useful
stats, and find problems like invalid entity annotations, cyclic
dependencies, low data labels and more.
"""
msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings) msg = Printer(pretty=not no_format, ignore_warnings=ignore_warnings)
# Make sure all files and paths exists if they are needed # Make sure all files and paths exists if they are needed
@ -72,21 +70,9 @@ def debug_data(
msg.divider("Data format validation") msg.divider("Data format validation")
# Validate data format using the JSON schema # TODO: Validate data format using the JSON schema
# TODO: update once the new format is ready # TODO: update once the new format is ready
# TODO: move validation to GoldCorpus in order to be able to load from dir # TODO: move validation to GoldCorpus in order to be able to load from dir
train_data_errors = [] # TODO: validate_json
dev_data_errors = [] # TODO: validate_json
if not train_data_errors:
msg.good("Training data JSON format is valid")
if not dev_data_errors:
msg.good("Development data JSON format is valid")
for error in train_data_errors:
msg.fail("Training data: {}".format(error))
for error in dev_data_errors:
msg.fail("Develoment data: {}".format(error))
if (train_data_errors or dev_data_errors) and not ignore_validation:
sys.exit(1)
# Create the gold corpus to be able to better analyze data # Create the gold corpus to be able to better analyze data
loading_train_error_message = "" loading_train_error_message = ""
@ -284,7 +270,7 @@ def debug_data(
if "textcat" in pipeline: if "textcat" in pipeline:
msg.divider("Text Classification") msg.divider("Text Classification")
labels = [label for label in gold_train_data["textcat"]] labels = [label for label in gold_train_data["cats"]]
model_labels = _get_labels_from_model(nlp, "textcat") model_labels = _get_labels_from_model(nlp, "textcat")
new_labels = [l for l in labels if l not in model_labels] new_labels = [l for l in labels if l not in model_labels]
existing_labels = [l for l in labels if l in model_labels] existing_labels = [l for l in labels if l in model_labels]
@ -295,13 +281,45 @@ def debug_data(
) )
if new_labels: if new_labels:
labels_with_counts = _format_labels( labels_with_counts = _format_labels(
gold_train_data["textcat"].most_common(), counts=True gold_train_data["cats"].most_common(), counts=True
) )
msg.text("New: {}".format(labels_with_counts), show=verbose) msg.text("New: {}".format(labels_with_counts), show=verbose)
if existing_labels: if existing_labels:
msg.text( msg.text(
"Existing: {}".format(_format_labels(existing_labels)), show=verbose "Existing: {}".format(_format_labels(existing_labels)), show=verbose
) )
if set(gold_train_data["cats"]) != set(gold_dev_data["cats"]):
msg.fail(
"The train and dev labels are not the same. "
"Train labels: {}. "
"Dev labels: {}.".format(
_format_labels(gold_train_data["cats"]),
_format_labels(gold_dev_data["cats"]),
)
)
if gold_train_data["n_cats_multilabel"] > 0:
msg.info(
"The train data contains instances without "
"mutually-exclusive classes. Use '--textcat-multilabel' "
"when training."
)
if gold_dev_data["n_cats_multilabel"] == 0:
msg.warn(
"Potential train/dev mismatch: the train data contains "
"instances without mutually-exclusive classes while the "
"dev data does not."
)
else:
msg.info(
"The train data contains only instances with "
"mutually-exclusive classes."
)
if gold_dev_data["n_cats_multilabel"] > 0:
msg.fail(
"Train/dev mismatch: the dev data contains instances "
"without mutually-exclusive classes while the train data "
"contains only instances with mutually-exclusive classes."
)
if "tagger" in pipeline: if "tagger" in pipeline:
msg.divider("Part-of-speech Tagging") msg.divider("Part-of-speech Tagging")
@ -518,6 +536,7 @@ def _compile_gold(train_docs, pipeline):
"n_sents": 0, "n_sents": 0,
"n_nonproj": 0, "n_nonproj": 0,
"n_cycles": 0, "n_cycles": 0,
"n_cats_multilabel": 0,
"texts": set(), "texts": set(),
} }
for doc, gold in train_docs: for doc, gold in train_docs:
@ -540,6 +559,8 @@ def _compile_gold(train_docs, pipeline):
data["ner"]["-"] += 1 data["ner"]["-"] += 1
if "textcat" in pipeline: if "textcat" in pipeline:
data["cats"].update(gold.cats) data["cats"].update(gold.cats)
if list(gold.cats.values()).count(1.0) != 1:
data["n_cats_multilabel"] += 1
if "tagger" in pipeline: if "tagger" in pipeline:
data["tags"].update([x for x in gold.tags if x is not None]) data["tags"].update([x for x in gold.tags if x is not None])
if "parser" in pipeline: if "parser" in pipeline:

View File

@ -28,6 +28,16 @@ def download(model, direct=False, *pip_args):
can be shortcut, model name or, if --direct flag is set, full model name can be shortcut, model name or, if --direct flag is set, full model name
with version. For direct downloads, the compatibility check will be skipped. with version. For direct downloads, the compatibility check will be skipped.
""" """
if not require_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
"You don't seem to have the spaCy package itself installed "
"(maybe because you've built from source?), so installing the "
"model dependencies would cause spaCy to be downloaded, which "
"probably isn't what you want. If the model package has other "
"dependencies, you'll have to install them manually."
)
pip_args = pip_args + ("--no-deps",)
dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}" dl_tpl = "{m}-{v}/{m}-{v}.tar.gz#egg={m}=={v}"
if direct: if direct:
components = model.split("-") components = model.split("-")
@ -72,12 +82,15 @@ def download(model, direct=False, *pip_args):
# is_package check currently fails, because pkg_resources.working_set # is_package check currently fails, because pkg_resources.working_set
# is not refreshed automatically (see #3923). We're trying to work # is not refreshed automatically (see #3923). We're trying to work
# around this here be requiring the package explicitly. # around this here be requiring the package explicitly.
require_package(model_name)
def require_package(name):
try: try:
pkg_resources.working_set.require(model_name) pkg_resources.working_set.require(name)
return True
except: # noqa: E722 except: # noqa: E722
# Maybe it's possible to remove this mostly worried about cross- return False
# platform and cross-Python copmpatibility here
pass
def get_json(url, desc): def get_json(url, desc):
@ -117,7 +130,7 @@ def get_version(model, comp):
def download_model(filename, user_pip_args=None): def download_model(filename, user_pip_args=None):
download_url = about.__download_url__ + "/" + filename download_url = about.__download_url__ + "/" + filename
pip_args = ["--no-cache-dir", "--no-deps"] pip_args = ["--no-cache-dir"]
if user_pip_args: if user_pip_args:
pip_args.extend(user_pip_args) pip_args.extend(user_pip_args)
cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url] cmd = [sys.executable, "-m", "pip", "install"] + pip_args + [download_url]

View File

@ -61,6 +61,7 @@ def evaluate(
"NER P": "%.2f" % scorer.ents_p, "NER P": "%.2f" % scorer.ents_p,
"NER R": "%.2f" % scorer.ents_r, "NER R": "%.2f" % scorer.ents_r,
"NER F": "%.2f" % scorer.ents_f, "NER F": "%.2f" % scorer.ents_f,
"Textcat": "%.2f" % scorer.textcat_score,
} }
msg.table(results, title="Results") msg.table(results, title="Results")

View File

@ -21,54 +21,40 @@ from .. import about
@plac.annotations( @plac.annotations(
# fmt: off
lang=("Model language", "positional", None, str), lang=("Model language", "positional", None, str),
output_path=("Output directory to store model in", "positional", None, Path), output_path=("Output directory to store model in", "positional", None, Path),
train_path=("Location of JSON-formatted training data", "positional", None, Path), train_path=("Location of JSON-formatted training data", "positional", None, Path),
dev_path=("Location of JSON-formatted development data", "positional", None, Path), dev_path=("Location of JSON-formatted development data", "positional", None, Path),
raw_text=( raw_text=("Path to jsonl file with unlabelled text documents.", "option", "rt", Path),
"Path to jsonl file with unlabelled text documents.",
"option",
"rt",
Path,
),
base_model=("Name of model to update (optional)", "option", "b", str), base_model=("Name of model to update (optional)", "option", "b", str),
pipeline=("Comma-separated names of pipeline components", "option", "p", str), pipeline=("Comma-separated names of pipeline components", "option", "p", str),
vectors=("Model to load vectors from", "option", "v", str), vectors=("Model to load vectors from", "option", "v", str),
n_iter=("Number of iterations", "option", "n", int), n_iter=("Number of iterations", "option", "n", int),
n_early_stopping=( n_early_stopping=("Maximum number of training epochs without dev accuracy improvement", "option", "ne", int),
"Maximum number of training epochs without dev accuracy improvement",
"option",
"ne",
int,
),
n_examples=("Number of examples", "option", "ns", int), n_examples=("Number of examples", "option", "ns", int),
use_gpu=("Use GPU", "option", "g", int), use_gpu=("Use GPU", "option", "g", int),
version=("Model version", "option", "V", str), version=("Model version", "option", "V", str),
meta_path=("Optional path to meta.json to use as base.", "option", "m", Path), meta_path=("Optional path to meta.json to use as base.", "option", "m", Path),
init_tok2vec=( init_tok2vec=("Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", "option", "t2v", Path),
"Path to pretrained weights for the token-to-vector parts of the models. See 'spacy pretrain'. Experimental.", parser_multitasks=("Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'", "option", "pt", str),
"option", entity_multitasks=("Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'", "option", "et", str),
"t2v",
Path,
),
parser_multitasks=(
"Side objectives for parser CNN, e.g. 'dep' or 'dep,tag'",
"option",
"pt",
str,
),
entity_multitasks=(
"Side objectives for NER CNN, e.g. 'dep' or 'dep,tag'",
"option",
"et",
str,
),
noise_level=("Amount of corruption for data augmentation", "option", "nl", float), noise_level=("Amount of corruption for data augmentation", "option", "nl", float),
orth_variant_level=(
"Amount of orthography variation for data augmentation",
"option",
"ovl",
float,
),
eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str), eval_beam_widths=("Beam widths to evaluate, e.g. 4,8", "option", "bw", str),
gold_preproc=("Use gold preprocessing", "flag", "G", bool), gold_preproc=("Use gold preprocessing", "flag", "G", bool),
learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool), learn_tokens=("Make parser learn gold-standard tokenization", "flag", "T", bool),
textcat_multilabel=("Textcat classes aren't mutually exclusive (multilabel)", "flag", "TML", bool),
textcat_arch=("Textcat model architecture", "option", "ta", str),
textcat_positive_label=("Textcat positive label for binary classes with two labels", "option", "tpl", str),
verbose=("Display more information for debug", "flag", "VV", bool), verbose=("Display more information for debug", "flag", "VV", bool),
debug=("Run data diagnostics before training", "flag", "D", bool), debug=("Run data diagnostics before training", "flag", "D", bool),
# fmt: on
) )
def train( def train(
lang, lang,
@ -89,9 +75,13 @@ def train(
parser_multitasks="", parser_multitasks="",
entity_multitasks="", entity_multitasks="",
noise_level=0.0, noise_level=0.0,
orth_variant_level=0.0,
eval_beam_widths="", eval_beam_widths="",
gold_preproc=False, gold_preproc=False,
learn_tokens=False, learn_tokens=False,
textcat_multilabel=False,
textcat_arch="bow",
textcat_positive_label=None,
verbose=False, verbose=False,
debug=False, debug=False,
): ):
@ -177,9 +167,37 @@ def train(
if pipe not in nlp.pipe_names: if pipe not in nlp.pipe_names:
if pipe == "parser": if pipe == "parser":
pipe_cfg = {"learn_tokens": learn_tokens} pipe_cfg = {"learn_tokens": learn_tokens}
elif pipe == "textcat":
pipe_cfg = {
"exclusive_classes": not textcat_multilabel,
"architecture": textcat_arch,
"positive_label": textcat_positive_label,
}
else: else:
pipe_cfg = {} pipe_cfg = {}
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
else:
if pipe == "textcat":
textcat_cfg = nlp.get_pipe("textcat").cfg
base_cfg = {
"exclusive_classes": textcat_cfg["exclusive_classes"],
"architecture": textcat_cfg["architecture"],
"positive_label": textcat_cfg["positive_label"],
}
pipe_cfg = {
"exclusive_classes": not textcat_multilabel,
"architecture": textcat_arch,
"positive_label": textcat_positive_label,
}
if base_cfg != pipe_cfg:
msg.fail(
"The base textcat model configuration does"
"not match the provided training options. "
"Existing cfg: {}, provided cfg: {}".format(
base_cfg, pipe_cfg
),
exits=1,
)
else: else:
msg.text("Starting with blank model '{}'".format(lang)) msg.text("Starting with blank model '{}'".format(lang))
lang_cls = util.get_lang_class(lang) lang_cls = util.get_lang_class(lang)
@ -187,6 +205,12 @@ def train(
for pipe in pipeline: for pipe in pipeline:
if pipe == "parser": if pipe == "parser":
pipe_cfg = {"learn_tokens": learn_tokens} pipe_cfg = {"learn_tokens": learn_tokens}
elif pipe == "textcat":
pipe_cfg = {
"exclusive_classes": not textcat_multilabel,
"architecture": textcat_arch,
"positive_label": textcat_positive_label,
}
else: else:
pipe_cfg = {} pipe_cfg = {}
nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg)) nlp.add_pipe(nlp.create_pipe(pipe, config=pipe_cfg))
@ -227,12 +251,89 @@ def train(
components = _load_pretrained_tok2vec(nlp, init_tok2vec) components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text("Loaded pretrained tok2vec for: {}".format(components)) msg.text("Loaded pretrained tok2vec for: {}".format(components))
# Verify textcat config
if "textcat" in pipeline:
textcat_labels = nlp.get_pipe("textcat").cfg["labels"]
if textcat_positive_label and textcat_positive_label not in textcat_labels:
msg.fail(
"The textcat_positive_label (tpl) '{}' does not match any "
"label in the training data.".format(textcat_positive_label),
exits=1,
)
if textcat_positive_label and len(textcat_labels) != 2:
msg.fail(
"A textcat_positive_label (tpl) '{}' was provided for training "
"data that does not appear to be a binary classification "
"problem with two labels.".format(textcat_positive_label),
exits=1,
)
train_docs = corpus.train_docs(
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
)
train_labels = set()
if textcat_multilabel:
multilabel_found = False
for text, gold in train_docs:
train_labels.update(gold.cats.keys())
if list(gold.cats.values()).count(1.0) != 1:
multilabel_found = True
if not multilabel_found and not base_model:
msg.warn(
"The textcat training instances look like they have "
"mutually-exclusive classes. Remove the flag "
"'--textcat-multilabel' to train a classifier with "
"mutually-exclusive classes."
)
if not textcat_multilabel:
for text, gold in train_docs:
train_labels.update(gold.cats.keys())
if list(gold.cats.values()).count(1.0) != 1 and not base_model:
msg.warn(
"Some textcat training instances do not have exactly "
"one positive label. Modifying training options to "
"include the flag '--textcat-multilabel' for classes "
"that are not mutually exclusive."
)
nlp.get_pipe("textcat").cfg["exclusive_classes"] = False
textcat_multilabel = True
break
if base_model and set(textcat_labels) != train_labels:
msg.fail(
"Cannot extend textcat model using data with different "
"labels. Base model labels: {}, training data labels: "
"{}.".format(textcat_labels, list(train_labels)),
exits=1,
)
if textcat_multilabel:
msg.text(
"Textcat evaluation score: ROC AUC score macro-averaged across "
"the labels '{}'".format(", ".join(textcat_labels))
)
elif textcat_positive_label and len(textcat_labels) == 2:
msg.text(
"Textcat evaluation score: F1-score for the "
"label '{}'".format(textcat_positive_label)
)
elif len(textcat_labels) > 1:
if len(textcat_labels) == 2:
msg.warn(
"If the textcat component is a binary classifier with "
"exclusive classes, provide '--textcat-positive-label' for "
"an evaluation on the positive class."
)
msg.text(
"Textcat evaluation score: F1-score macro-averaged across "
"the labels '{}'".format(", ".join(textcat_labels))
)
else:
msg.fail(
"Unsupported textcat configuration. Use `spacy debug-data` "
"for more information."
)
# fmt: off # fmt: off
row_head = ["Itn", "Dep Loss", "NER Loss", "UAS", "NER P", "NER R", "NER F", "Tag %", "Token %", "CPU WPS", "GPU WPS"] row_head, output_stats = _configure_training_output(pipeline, use_gpu, has_beam_widths)
row_widths = [3, 10, 10, 7, 7, 7, 7, 7, 7, 7, 7] row_widths = [len(w) for w in row_head]
if has_beam_widths:
row_head.insert(1, "Beam W.")
row_widths.insert(1, 7)
row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2} row_settings = {"widths": row_widths, "aligns": tuple(["r" for i in row_head]), "spacing": 2}
# fmt: on # fmt: on
print("") print("")
@ -243,7 +344,11 @@ def train(
best_score = 0.0 best_score = 0.0
for i in range(n_iter): for i in range(n_iter):
train_docs = corpus.train_docs( train_docs = corpus.train_docs(
nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0 nlp,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
gold_preproc=gold_preproc,
max_length=0,
) )
if raw_text: if raw_text:
random.shuffle(raw_text) random.shuffle(raw_text)
@ -286,7 +391,7 @@ def train(
) )
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs) nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer() start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, debug) scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
end_time = timer() end_time = timer()
if use_gpu < 0: if use_gpu < 0:
gpu_wps = None gpu_wps = None
@ -302,7 +407,7 @@ def train(
corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc) corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
) )
start_time = timer() start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs) scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
end_time = timer() end_time = timer()
cpu_wps = nwords / (end_time - start_time) cpu_wps = nwords / (end_time - start_time)
acc_loc = output_path / ("model%d" % i) / "accuracy.json" acc_loc = output_path / ("model%d" % i) / "accuracy.json"
@ -336,6 +441,7 @@ def train(
} }
meta.setdefault("name", "model%d" % i) meta.setdefault("name", "model%d" % i)
meta.setdefault("version", version) meta.setdefault("version", version)
meta["labels"] = nlp.meta["labels"]
meta_loc = output_path / ("model%d" % i) / "meta.json" meta_loc = output_path / ("model%d" % i) / "meta.json"
srsly.write_json(meta_loc, meta) srsly.write_json(meta_loc, meta)
util.set_env_log(verbose) util.set_env_log(verbose)
@ -344,10 +450,19 @@ def train(
i, i,
losses, losses,
scorer.scores, scorer.scores,
output_stats,
beam_width=beam_width if has_beam_widths else None, beam_width=beam_width if has_beam_widths else None,
cpu_wps=cpu_wps, cpu_wps=cpu_wps,
gpu_wps=gpu_wps, gpu_wps=gpu_wps,
) )
if i == 0 and "textcat" in pipeline:
textcats_per_cat = scorer.scores.get("textcats_per_cat", {})
for cat, cat_score in textcats_per_cat.items():
if cat_score.get("roc_auc_score", 0) < 0:
msg.warn(
"Textcat ROC AUC score is undefined due to "
"only one value in label '{}'.".format(cat)
)
msg.row(progress, **row_settings) msg.row(progress, **row_settings)
# Early stopping # Early stopping
if n_early_stopping is not None: if n_early_stopping is not None:
@ -388,6 +503,8 @@ def _score_for_model(meta):
mean_acc.append((acc["uas"] + acc["las"]) / 2) mean_acc.append((acc["uas"] + acc["las"]) / 2)
if "ner" in pipes: if "ner" in pipes:
mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3) mean_acc.append((acc["ents_p"] + acc["ents_r"] + acc["ents_f"]) / 3)
if "textcat" in pipes:
mean_acc.append(acc["textcat_score"])
return sum(mean_acc) / len(mean_acc) return sum(mean_acc) / len(mean_acc)
@ -471,40 +588,55 @@ def _get_metrics(component):
return ("token_acc",) return ("token_acc",)
def _get_progress(itn, losses, dev_scores, beam_width=None, cpu_wps=0.0, gpu_wps=0.0): def _configure_training_output(pipeline, use_gpu, has_beam_widths):
row_head = ["Itn"]
output_stats = []
for pipe in pipeline:
if pipe == "tagger":
row_head.extend(["Tag Loss ", " Tag % "])
output_stats.extend(["tag_loss", "tags_acc"])
elif pipe == "parser":
row_head.extend(["Dep Loss ", " UAS ", " LAS "])
output_stats.extend(["dep_loss", "uas", "las"])
elif pipe == "ner":
row_head.extend(["NER Loss ", "NER P ", "NER R ", "NER F "])
output_stats.extend(["ner_loss", "ents_p", "ents_r", "ents_f"])
elif pipe == "textcat":
row_head.extend(["Textcat Loss", "Textcat"])
output_stats.extend(["textcat_loss", "textcat_score"])
row_head.extend(["Token %", "CPU WPS"])
output_stats.extend(["token_acc", "cpu_wps"])
if use_gpu >= 0:
row_head.extend(["GPU WPS"])
output_stats.extend(["gpu_wps"])
if has_beam_widths:
row_head.insert(1, "Beam W.")
return row_head, output_stats
def _get_progress(
itn, losses, dev_scores, output_stats, beam_width=None, cpu_wps=0.0, gpu_wps=0.0
):
scores = {} scores = {}
for col in [ for stat in output_stats:
"dep_loss", scores[stat] = 0.0
"tag_loss",
"uas",
"tags_acc",
"token_acc",
"ents_p",
"ents_r",
"ents_f",
"cpu_wps",
"gpu_wps",
]:
scores[col] = 0.0
scores["dep_loss"] = losses.get("parser", 0.0) scores["dep_loss"] = losses.get("parser", 0.0)
scores["ner_loss"] = losses.get("ner", 0.0) scores["ner_loss"] = losses.get("ner", 0.0)
scores["tag_loss"] = losses.get("tagger", 0.0) scores["tag_loss"] = losses.get("tagger", 0.0)
scores.update(dev_scores) scores["textcat_loss"] = losses.get("textcat", 0.0)
scores["cpu_wps"] = cpu_wps scores["cpu_wps"] = cpu_wps
scores["gpu_wps"] = gpu_wps or 0.0 scores["gpu_wps"] = gpu_wps or 0.0
result = [ scores.update(dev_scores)
itn, formatted_scores = []
"{:.3f}".format(scores["dep_loss"]), for stat in output_stats:
"{:.3f}".format(scores["ner_loss"]), format_spec = "{:.3f}"
"{:.3f}".format(scores["uas"]), if stat.endswith("_wps"):
"{:.3f}".format(scores["ents_p"]), format_spec = "{:.0f}"
"{:.3f}".format(scores["ents_r"]), formatted_scores.append(format_spec.format(scores[stat]))
"{:.3f}".format(scores["ents_f"]), result = [itn + 1]
"{:.3f}".format(scores["tags_acc"]), result.extend(formatted_scores)
"{:.3f}".format(scores["token_acc"]),
"{:.0f}".format(scores["cpu_wps"]),
"{:.0f}".format(scores["gpu_wps"]),
]
if beam_width is not None: if beam_width is not None:
result.insert(1, beam_width) result.insert(1, beam_width)
return result return result

View File

@ -84,6 +84,8 @@ class Warnings(object):
W018 = ("Entity '{entity}' already exists in the Knowledge base.") W018 = ("Entity '{entity}' already exists in the Knowledge base.")
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with " W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
"previously loaded vectors. See Issue #3853.") "previously loaded vectors. See Issue #3853.")
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
"loaded. (Shape: {shape})")
@add_codes @add_codes
@ -457,6 +459,16 @@ class Errors(object):
E160 = ("Can't find language data file: {path}") E160 = ("Can't find language data file: {path}")
E161 = ("Found an internal inconsistency when predicting entity links. " E161 = ("Found an internal inconsistency when predicting entity links. "
"This is likely a bug in spaCy, so feel free to open an issue.") "This is likely a bug in spaCy, so feel free to open an issue.")
E162 = ("Cannot evaluate textcat model on data with different labels.\n"
"Labels in model: {model_labels}\nLabels in evaluation "
"data: {eval_labels}")
E163 = ("cumsum was found to be unstable: its last element does not "
"correspond to sum")
E164 = ("x is neither increasing nor decreasing: {}.")
E165 = ("Only one class present in y_true. ROC AUC score is not defined in "
"that case.")
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
"Current DocBin: {current}\nOther DocBin: {other}")
@add_codes @add_codes

View File

@ -307,4 +307,10 @@ GLOSSARY = {
# https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf # https://pdfs.semanticscholar.org/5744/578cc243d92287f47448870bb426c66cc941.pdf
"PER": "Named person or family.", "PER": "Named person or family.",
"MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art", "MISC": "Miscellaneous entities, e.g. events, nationalities, products or works of art",
# https://github.com/ltgoslo/norne
"EVT": "Festivals, cultural events, sports events, weather phenomena, wars, etc.",
"PROD": "Product, i.e. artificially produced entities including speeches, radio shows, programming languages, contracts, laws and ideas",
"DRV": "Words (and phrases?) that are derived from a name, but not a name in themselves, e.g. 'Oslo-mannen' ('the man from Oslo')",
"GPE_LOC": "Geo-political entity, with a locative sense, e.g. 'John lives in Spain'",
"GPE_ORG": "Geo-political entity, with an organisation sense, e.g. 'Spain declined to meet with Belgium'",
} }

View File

@ -24,6 +24,7 @@ cdef class GoldParse:
cdef public int loss cdef public int loss
cdef public list words cdef public list words
cdef public list tags cdef public list tags
cdef public list morphology
cdef public list heads cdef public list heads
cdef public list labels cdef public list labels
cdef public dict orths cdef public dict orths

View File

@ -7,6 +7,7 @@ import random
import numpy import numpy
import tempfile import tempfile
import shutil import shutil
import itertools
from pathlib import Path from pathlib import Path
import srsly import srsly
@ -56,6 +57,7 @@ def tags_to_entities(tags):
def merge_sents(sents): def merge_sents(sents):
m_deps = [[], [], [], [], [], []] m_deps = [[], [], [], [], [], []]
m_brackets = [] m_brackets = []
m_cats = sents.pop()
i = 0 i = 0
for (ids, words, tags, heads, labels, ner), brackets in sents: for (ids, words, tags, heads, labels, ner), brackets in sents:
m_deps[0].extend(id_ + i for id_ in ids) m_deps[0].extend(id_ + i for id_ in ids)
@ -67,6 +69,7 @@ def merge_sents(sents):
m_brackets.extend((b["first"] + i, b["last"] + i, b["label"]) m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
for b in brackets) for b in brackets)
i += len(ids) i += len(ids)
m_deps.append(m_cats)
return [(m_deps, m_brackets)] return [(m_deps, m_brackets)]
@ -198,6 +201,7 @@ class GoldCorpus(object):
n = 0 n = 0
i = 0 i = 0
for raw_text, paragraph_tuples in self.train_tuples: for raw_text, paragraph_tuples in self.train_tuples:
cats = paragraph_tuples.pop()
for sent_tuples, brackets in paragraph_tuples: for sent_tuples, brackets in paragraph_tuples:
n += len(sent_tuples[1]) n += len(sent_tuples[1])
if self.limit and i >= self.limit: if self.limit and i >= self.limit:
@ -206,13 +210,14 @@ class GoldCorpus(object):
return n return n
def train_docs(self, nlp, gold_preproc=False, max_length=None, def train_docs(self, nlp, gold_preproc=False, max_length=None,
noise_level=0.0): noise_level=0.0, orth_variant_level=0.0):
locs = list((self.tmp_dir / 'train').iterdir()) locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs) random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit) train_tuples = self.read_tuples(locs, limit=self.limit)
gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc,
max_length=max_length, max_length=max_length,
noise_level=noise_level, noise_level=noise_level,
orth_variant_level=orth_variant_level,
make_projective=True) make_projective=True)
yield from gold_docs yield from gold_docs
@ -226,43 +231,132 @@ class GoldCorpus(object):
@classmethod @classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None, def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
noise_level=0.0, make_projective=False): noise_level=0.0, orth_variant_level=0.0, make_projective=False):
for raw_text, paragraph_tuples in tuples: for raw_text, paragraph_tuples in tuples:
if gold_preproc: if gold_preproc:
raw_text = None raw_text = None
else: else:
paragraph_tuples = merge_sents(paragraph_tuples) paragraph_tuples = merge_sents(paragraph_tuples)
docs = cls._make_docs(nlp, raw_text, paragraph_tuples, gold_preproc, docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
noise_level=noise_level) paragraph_tuples, gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
golds = cls._make_golds(docs, paragraph_tuples, make_projective) golds = cls._make_golds(docs, paragraph_tuples, make_projective)
for doc, gold in zip(docs, golds): for doc, gold in zip(docs, golds):
if (not max_length) or len(doc) < max_length: if (not max_length) or len(doc) < max_length:
yield doc, gold yield doc, gold
@classmethod @classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0): def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
if raw_text is not None: if raw_text is not None:
raw_text, paragraph_tuples = make_orth_variants(nlp, raw_text, paragraph_tuples, orth_variant_level=orth_variant_level)
raw_text = add_noise(raw_text, noise_level) raw_text = add_noise(raw_text, noise_level)
return [nlp.make_doc(raw_text)] return [nlp.make_doc(raw_text)], paragraph_tuples
else: else:
docs = []
raw_text, paragraph_tuples = make_orth_variants(nlp, None, paragraph_tuples, orth_variant_level=orth_variant_level)
return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level)) return [Doc(nlp.vocab, words=add_noise(sent_tuples[1], noise_level))
for (sent_tuples, brackets) in paragraph_tuples] for (sent_tuples, brackets) in paragraph_tuples], paragraph_tuples
@classmethod @classmethod
def _make_golds(cls, docs, paragraph_tuples, make_projective): def _make_golds(cls, docs, paragraph_tuples, make_projective):
if len(docs) != len(paragraph_tuples): if len(docs) != len(paragraph_tuples):
n_annots = len(paragraph_tuples) n_annots = len(paragraph_tuples)
raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots)) raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots))
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0], paragraph_tuples[0][0],
make_projective=make_projective)]
else:
return [GoldParse.from_annot_tuples(doc, sent_tuples, return [GoldParse.from_annot_tuples(doc, sent_tuples,
make_projective=make_projective) make_projective=make_projective)
for doc, (sent_tuples, brackets) for doc, (sent_tuples, brackets)
in zip(docs, paragraph_tuples)] in zip(docs, paragraph_tuples)]
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
    """Randomly swap punctuation tokens for orthographic variants.

    With probability `orth_variant_level` the annotations (and, if given,
    the raw text) are rewritten: about half of the time everything is
    lowercased, and tokens listed in `nlp.Defaults.single_orth_variants` /
    `nlp.Defaults.paired_orth_variants` are replaced by a randomly chosen
    variant from the same group.

    nlp: Object exposing `Defaults.single_orth_variants` and
        `Defaults.paired_orth_variants`. Each entry is a dict with a "tags"
        list and a "variants" list (of strings for single variants, of
        2-item sequences for paired variants).
    raw (unicode or None): Raw text matching the tokens, or None.
    paragraph_tuples (list): `(sent_tuples, brackets)` pairs, where
        `sent_tuples` unpacks to (ids, words, tags, heads, labels, ner, cats).
    orth_variant_level (float): Probability of applying any variation.
    RETURNS (tuple): `(raw, paragraph_tuples)`. The inputs are returned
        unchanged if no variation is applied or if the modified tokens
        cannot be aligned with the raw text.
    """
    if random.random() >= orth_variant_level:
        return raw, paragraph_tuples
    # Remember the untouched text so that an aborted alignment can hand back
    # exactly what the caller passed in (raw may be lowercased below).
    raw_orig = raw
    # Must be initialised: the branch below only runs about half of the time.
    lower = False
    if random.random() >= 0.5:
        lower = True
        if raw is not None:
            raw = raw.lower()
    ndsv = nlp.Defaults.single_orth_variants
    ndpv = nlp.Defaults.paired_orth_variants
    # modify words in paragraph_tuples
    variant_paragraph_tuples = []
    for sent_tuples, brackets in paragraph_tuples:
        ids, words, tags, heads, labels, ner, cats = sent_tuples
        if lower:
            words = [w.lower() for w in words]
        else:
            # Work on a copy so the caller's annotations are never edited in
            # place (they are returned as-is if alignment fails).
            words = list(words)
        # single variants
        punct_choices = [random.choice(x["variants"]) for x in ndsv]
        for word_idx in range(len(words)):
            for punct_idx in range(len(ndsv)):
                if tags[word_idx] in ndsv[punct_idx]["tags"] \
                        and words[word_idx] in ndsv[punct_idx]["variants"]:
                    words[word_idx] = punct_choices[punct_idx]
        # paired variants
        punct_choices = [random.choice(x["variants"]) for x in ndpv]
        for word_idx in range(len(words)):
            for punct_idx in range(len(ndpv)):
                if tags[word_idx] in ndpv[punct_idx]["tags"] \
                        and words[word_idx] in itertools.chain.from_iterable(ndpv[punct_idx]["variants"]):
                    # backup option: random left vs. right from pair
                    pair_idx = random.choice([0, 1])
                    # best option: rely on paired POS tags like `` / ''
                    if len(ndpv[punct_idx]["tags"]) == 2:
                        pair_idx = ndpv[punct_idx]["tags"].index(tags[word_idx])
                    # next best option: rely on position in variants
                    # (may not be unambiguous, so order of variants matters)
                    else:
                        for pair in ndpv[punct_idx]["variants"]:
                            if words[word_idx] in pair:
                                pair_idx = pair.index(words[word_idx])
                    words[word_idx] = punct_choices[punct_idx][pair_idx]
        variant_paragraph_tuples.append(((ids, words, tags, heads, labels, ner, cats), brackets))
    # modify raw to match variant_paragraph_tuples
    if raw is not None:
        variants = []
        for single_variants in ndsv:
            variants.extend(single_variants["variants"])
        for paired_variants in ndpv:
            variants.extend(list(itertools.chain.from_iterable(paired_variants["variants"])))
        # store variants in reverse length order to be able to prioritize
        # longer matches (e.g., "---" before "--")
        variants = sorted(variants, key=lambda x: len(x))
        variants.reverse()
        variant_raw = ""
        raw_idx = 0
        # add initial whitespace
        while raw_idx < len(raw) and re.match(r"\s", raw[raw_idx]):
            variant_raw += raw[raw_idx]
            raw_idx += 1
        for sent_tuples, brackets in variant_paragraph_tuples:
            ids, words, tags, heads, labels, ner, cats = sent_tuples
            for word in words:
                match_found = False
                # add identical word
                if word not in variants and raw[raw_idx:].startswith(word):
                    variant_raw += word
                    raw_idx += len(word)
                    match_found = True
                # add variant word
                else:
                    for variant in variants:
                        if not match_found and \
                                raw[raw_idx:].startswith(variant):
                            raw_idx += len(variant)
                            variant_raw += word
                            match_found = True
                # something went wrong, abort and return the original,
                # mutually consistent text and annotations
                # (add a warning message?)
                if not match_found:
                    return raw_orig, paragraph_tuples
                # add following whitespace
                while raw_idx < len(raw) and re.match(r"\s", raw[raw_idx]):
                    variant_raw += raw[raw_idx]
                    raw_idx += 1
        return variant_raw, variant_paragraph_tuples
    return raw, variant_paragraph_tuples
def add_noise(orig, noise_level): def add_noise(orig, noise_level):
if random.random() >= noise_level: if random.random() >= noise_level:
return orig return orig
@ -277,12 +371,8 @@ def add_noise(orig, noise_level):
def _corrupt(c, noise_level): def _corrupt(c, noise_level):
if random.random() >= noise_level: if random.random() >= noise_level:
return c return c
elif c == " ":
return "\n"
elif c == "\n":
return " "
elif c in [".", "'", "!", "?", ","]: elif c in [".", "'", "!", "?", ","]:
return "" return "\n"
else: else:
return c.lower() return c.lower()
@ -330,6 +420,10 @@ def json_to_tuple(doc):
sents.append([ sents.append([
[ids, words, tags, heads, labels, ner], [ids, words, tags, heads, labels, ner],
sent.get("brackets", [])]) sent.get("brackets", [])])
cats = {}
for cat in paragraph.get("cats", {}):
cats[cat["label"]] = cat["value"]
sents.append(cats)
if sents: if sents:
yield [paragraph.get("raw", None), sents] yield [paragraph.get("raw", None), sents]
@ -443,11 +537,12 @@ cdef class GoldParse:
""" """
@classmethod @classmethod
def from_annot_tuples(cls, doc, annot_tuples, make_projective=False): def from_annot_tuples(cls, doc, annot_tuples, make_projective=False):
_, words, tags, heads, deps, entities = annot_tuples _, words, tags, heads, deps, entities, cats = annot_tuples
return cls(doc, words=words, tags=tags, heads=heads, deps=deps, return cls(doc, words=words, tags=tags, heads=heads, deps=deps,
entities=entities, make_projective=make_projective) entities=entities, cats=cats,
make_projective=make_projective)
def __init__(self, doc, annot_tuples=None, words=None, tags=None, def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
heads=None, deps=None, entities=None, make_projective=False, heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None, **_): cats=None, links=None, **_):
"""Create a GoldParse. """Create a GoldParse.
@ -482,11 +577,13 @@ cdef class GoldParse:
if words is None: if words is None:
words = [token.text for token in doc] words = [token.text for token in doc]
if tags is None: if tags is None:
tags = [None for _ in doc] tags = [None for _ in words]
if heads is None: if heads is None:
heads = [None for token in doc] heads = [None for _ in words]
if deps is None: if deps is None:
deps = [None for _ in doc] deps = [None for _ in words]
if morphology is None:
morphology = [None for _ in words]
if entities is None: if entities is None:
entities = ["-" for _ in doc] entities = ["-" for _ in doc]
elif len(entities) == 0: elif len(entities) == 0:
@ -498,7 +595,6 @@ cdef class GoldParse:
if not isinstance(entities[0], basestring): if not isinstance(entities[0], basestring):
# Assume we have entities specified by character offset. # Assume we have entities specified by character offset.
entities = biluo_tags_from_offsets(doc, entities) entities = biluo_tags_from_offsets(doc, entities)
self.mem = Pool() self.mem = Pool()
self.loss = 0 self.loss = 0
self.length = len(doc) self.length = len(doc)
@ -518,6 +614,7 @@ cdef class GoldParse:
self.heads = [None] * len(doc) self.heads = [None] * len(doc)
self.labels = [None] * len(doc) self.labels = [None] * len(doc)
self.ner = [None] * len(doc) self.ner = [None] * len(doc)
self.morphology = [None] * len(doc)
# This needs to be done before we align the words # This needs to be done before we align the words
if make_projective and heads is not None and deps is not None: if make_projective and heads is not None and deps is not None:
@ -544,11 +641,13 @@ cdef class GoldParse:
self.tags[i] = "_SP" self.tags[i] = "_SP"
self.heads[i] = None self.heads[i] = None
self.labels[i] = None self.labels[i] = None
self.ner[i] = "O" self.ner[i] = None
self.morphology[i] = set()
if gold_i is None: if gold_i is None:
if i in i2j_multi: if i in i2j_multi:
self.words[i] = words[i2j_multi[i]] self.words[i] = words[i2j_multi[i]]
self.tags[i] = tags[i2j_multi[i]] self.tags[i] = tags[i2j_multi[i]]
self.morphology[i] = morphology[i2j_multi[i]]
is_last = i2j_multi[i] != i2j_multi.get(i+1) is_last = i2j_multi[i] != i2j_multi.get(i+1)
is_first = i2j_multi[i] != i2j_multi.get(i-1) is_first = i2j_multi[i] != i2j_multi.get(i-1)
# Set next word in multi-token span as head, until last # Set next word in multi-token span as head, until last
@ -585,6 +684,7 @@ cdef class GoldParse:
else: else:
self.words[i] = words[gold_i] self.words[i] = words[gold_i]
self.tags[i] = tags[gold_i] self.tags[i] = tags[gold_i]
self.morphology[i] = morphology[gold_i]
if heads[gold_i] is None: if heads[gold_i] is None:
self.heads[i] = None self.heads[i] = None
else: else:
@ -592,9 +692,20 @@ cdef class GoldParse:
self.labels[i] = deps[gold_i] self.labels[i] = deps[gold_i]
self.ner[i] = entities[gold_i] self.ner[i] = entities[gold_i]
# Prevent whitespace that isn't within entities from being tagged as
# an entity.
for i in range(len(self.ner)):
if self.tags[i] == "_SP":
prev_ner = self.ner[i-1] if i >= 1 else None
next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
if prev_ner == "O" or next_ner == "O":
self.ner[i] = "O"
cycle = nonproj.contains_cycle(self.heads) cycle = nonproj.contains_cycle(self.heads)
if cycle is not None: if cycle is not None:
raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50]))) raise ValueError(Errors.E069.format(cycle=cycle,
cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
doc_tokens=" ".join(words[:50])))
def __len__(self): def __len__(self):
"""Get the number of gold-standard tokens. """Get the number of gold-standard tokens.
@ -638,7 +749,10 @@ def docs_to_json(docs, id=0):
docs = [docs] docs = [docs]
json_doc = {"id": id, "paragraphs": []} json_doc = {"id": id, "paragraphs": []}
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
json_para = {'raw': doc.text, "sentences": []} json_para = {'raw': doc.text, "sentences": [], "cats": []}
for cat, val in doc.cats.items():
json_cat = {"label": cat, "value": val}
json_para["cats"].append(json_cat)
ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents] ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
biluo_tags = biluo_tags_from_offsets(doc, ent_offsets) biluo_tags = biluo_tags_from_offsets(doc, ent_offsets)
for j, sent in enumerate(doc.sents): for j, sent in enumerate(doc.sents):

View File

@ -24,7 +24,7 @@ cdef class Candidate:
algorithm which will disambiguate the various candidates to the correct one. algorithm which will disambiguate the various candidates to the correct one.
Each candidate (alias, entity) pair is assigned to a certain prior probability. Each candidate (alias, entity) pair is assigned to a certain prior probability.
DOCS: https://spacy.io/api/candidate DOCS: https://spacy.io/api/kb/#candidate_init
""" """
def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob):

View File

@ -201,7 +201,9 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ"
_upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper
_lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower
_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu _uncased = (
_bengali + _hebrew + _persian + _sinhala + _hindi + _kannada + _tamil + _telugu
)
ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased)
ALPHA_LOWER = group_chars(_lower + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased)

View File

@ -27,6 +27,20 @@ class GermanDefaults(Language.Defaults):
stop_words = STOP_WORDS stop_words = STOP_WORDS
syntax_iterators = SYNTAX_ITERATORS syntax_iterators = SYNTAX_ITERATORS
resources = {"lemma_lookup": "lemma_lookup.json"} resources = {"lemma_lookup": "lemma_lookup.json"}
single_orth_variants = [
{"tags": ["$("], "variants": ["", "..."]},
{"tags": ["$("], "variants": ["-", "", "", "--", "---", "——"]},
]
paired_orth_variants = [
{
"tags": ["$("],
"variants": [("'", "'"), (",", "'"), ("", ""), ("", ""), ("", "")],
},
{
"tags": ["$("],
"variants": [("``", "''"), ('"', '"'), ("", ""), ("»", "«"), ("«", "»")],
},
]
class German(Language): class German(Language):

View File

@ -10,7 +10,7 @@ TAG_MAP = {
"$,": {POS: PUNCT, "PunctType": "comm"}, "$,": {POS: PUNCT, "PunctType": "comm"},
"$.": {POS: PUNCT, "PunctType": "peri"}, "$.": {POS: PUNCT, "PunctType": "peri"},
"ADJA": {POS: ADJ}, "ADJA": {POS: ADJ},
"ADJD": {POS: ADJ, "Variant": "short"}, "ADJD": {POS: ADJ},
"ADV": {POS: ADV}, "ADV": {POS: ADV},
"APPO": {POS: ADP, "AdpType": "post"}, "APPO": {POS: ADP, "AdpType": "post"},
"APPR": {POS: ADP, "AdpType": "prep"}, "APPR": {POS: ADP, "AdpType": "prep"},
@ -32,7 +32,7 @@ TAG_MAP = {
"PDAT": {POS: DET, "PronType": "dem"}, "PDAT": {POS: DET, "PronType": "dem"},
"PDS": {POS: PRON, "PronType": "dem"}, "PDS": {POS: PRON, "PronType": "dem"},
"PIAT": {POS: DET, "PronType": "ind|neg|tot"}, "PIAT": {POS: DET, "PronType": "ind|neg|tot"},
"PIDAT": {POS: DET, "AdjType": "pdt", "PronType": "ind|neg|tot"}, "PIDAT": {POS: DET, "PronType": "ind|neg|tot"},
"PIS": {POS: PRON, "PronType": "ind|neg|tot"}, "PIS": {POS: PRON, "PronType": "ind|neg|tot"},
"PPER": {POS: PRON, "PronType": "prs"}, "PPER": {POS: PRON, "PronType": "prs"},
"PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"}, "PPOSAT": {POS: DET, "Poss": "yes", "PronType": "prs"},
@ -42,7 +42,7 @@ TAG_MAP = {
"PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"}, "PRF": {POS: PRON, "PronType": "prs", "Reflex": "yes"},
"PTKA": {POS: PART}, "PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"}, "PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Polarity": "Neg"}, "PTKNEG": {POS: PART, "Polarity": "neg"},
"PTKVZ": {POS: PART, "PartType": "vbp"}, "PTKVZ": {POS: PART, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"}, "PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"}, "PWAT": {POS: DET, "PronType": "int"},

View File

@ -46,9 +46,10 @@ class GreekLemmatizer(object):
) )
return lemmas return lemmas
def lookup(self, string): def lookup(self, string, orth=None):
if string in self.lookup_table: key = orth if orth is not None else string
return self.lookup_table[string] if key in self.lookup_table:
return self.lookup_table[key]
return string return string

View File

@ -38,6 +38,14 @@ class EnglishDefaults(Language.Defaults):
"lemma_index": "lemmatizer/lemma_index.json", "lemma_index": "lemmatizer/lemma_index.json",
"lemma_exc": "lemmatizer/lemma_exc.json", "lemma_exc": "lemmatizer/lemma_exc.json",
} }
single_orth_variants = [
{"tags": ["NFP"], "variants": ["", "..."]},
{"tags": [":"], "variants": ["-", "", "", "--", "---", "——"]},
]
paired_orth_variants = [
{"tags": ["``", "''"], "variants": [("'", "'"), ("", "")]},
{"tags": ["``", "''"], "variants": [('"', '"'), ("", "")]},
]
class English(Language): class English(Language):

View File

@ -3,55 +3,59 @@ from __future__ import unicode_literals
from ...symbols import LEMMA, PRON_LEMMA from ...symbols import LEMMA, PRON_LEMMA
# Several entries here look pretty suspicious. These will get the POS SCONJ
# given the tag IN, when an adpositional reading seems much more likely for
# a lot of these prepositions. I'm not sure what I was running in 04395ffa4
# when I did this? It doesn't seem right.
_subordinating_conjunctions = [ _subordinating_conjunctions = [
"that", "that",
"if", "if",
"as", "as",
"because", "because",
"of", # "of",
"for", # "for",
"before", # "before",
"in", # "in",
"while", "while",
"after", # "after",
"since", "since",
"like", "like",
"with", # "with",
"so", "so",
"to", # "to",
"by", # "by",
"on", # "on",
"about", # "about",
"than", "than",
"whether", "whether",
"although", "although",
"from", # "from",
"though", "though",
"until", # "until",
"unless", "unless",
"once", "once",
"without", # "without",
"at", # "at",
"into", # "into",
"cause", "cause",
"over", # "over",
"upon", "upon",
"till", "till",
"whereas", "whereas",
"beyond", # "beyond",
"whilst", "whilst",
"except", "except",
"despite", "despite",
"wether", "wether",
"then", # "then",
"but", "but",
"becuse", "becuse",
"whie", "whie",
"below", # "below",
"against", # "against",
"it", "it",
"w/out", "w/out",
"toward", # "toward",
"albeit", "albeit",
"save", "save",
"besides", "besides",
@ -63,16 +67,17 @@ _subordinating_conjunctions = [
"out", "out",
"near", "near",
"seince", "seince",
"towards", # "towards",
"tho", "tho",
"sice", "sice",
"will", "will",
] ]
_relative_pronouns = ["this", "that", "those", "these"] # This seems kind of wrong too?
# _relative_pronouns = ["this", "that", "those", "these"]
MORPH_RULES = { MORPH_RULES = {
"DT": {word: {"POS": "PRON"} for word in _relative_pronouns}, # "DT": {word: {"POS": "PRON"} for word in _relative_pronouns},
"IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions}, "IN": {word: {"POS": "SCONJ"} for word in _subordinating_conjunctions},
"NN": { "NN": {
"something": {"POS": "PRON"}, "something": {"POS": "PRON"},

View File

@ -14,10 +14,10 @@ TAG_MAP = {
'""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, '""': {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
"''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"}, "''": {POS: PUNCT, "PunctType": "quot", "PunctSide": "fin"},
":": {POS: PUNCT}, ":": {POS: PUNCT},
"$": {POS: SYM, "Other": {"SymType": "currency"}}, "$": {POS: SYM},
"#": {POS: SYM, "Other": {"SymType": "numbersign"}}, "#": {POS: SYM},
"AFX": {POS: X, "Hyph": "yes"}, "AFX": {POS: ADJ, "Hyph": "yes"},
"CC": {POS: CCONJ, "ConjType": "coor"}, "CC": {POS: CCONJ, "ConjType": "comp"},
"CD": {POS: NUM, "NumType": "card"}, "CD": {POS: NUM, "NumType": "card"},
"DT": {POS: DET}, "DT": {POS: DET},
"EX": {POS: PRON, "AdvType": "ex"}, "EX": {POS: PRON, "AdvType": "ex"},
@ -34,7 +34,7 @@ TAG_MAP = {
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
"NNS": {POS: NOUN, "Number": "plur"}, "NNS": {POS: NOUN, "Number": "plur"},
"PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"}, "PDT": {POS: DET},
"POS": {POS: PART, "Poss": "yes"}, "POS": {POS: PART, "Poss": "yes"},
"PRP": {POS: PRON, "PronType": "prs"}, "PRP": {POS: PRON, "PronType": "prs"},
"PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"}, "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
@ -56,12 +56,12 @@ TAG_MAP = {
"VerbForm": "fin", "VerbForm": "fin",
"Tense": "pres", "Tense": "pres",
"Number": "sing", "Number": "sing",
"Person": 3, "Person": "three",
}, },
"WDT": {POS: PRON, "PronType": "int|rel"}, "WDT": {POS: PRON},
"WP": {POS: PRON, "PronType": "int|rel"}, "WP": {POS: PRON},
"WP$": {POS: PRON, "Poss": "yes", "PronType": "int|rel"}, "WP$": {POS: PRON, "Poss": "yes"},
"WRB": {POS: ADV, "PronType": "int|rel"}, "WRB": {POS: ADV},
"ADD": {POS: X}, "ADD": {POS: X},
"NFP": {POS: PUNCT}, "NFP": {POS: PUNCT},
"GW": {POS: X}, "GW": {POS: X},

View File

@ -30,14 +30,7 @@ for pron in ["i"]:
for orth in [pron, pron.title()]: for orth in [pron, pron.title()]:
_exc[orth + "'m"] = [ _exc[orth + "'m"] = [
{ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"}, {ORTH: orth, LEMMA: PRON_LEMMA, NORM: pron, TAG: "PRP"},
{ {ORTH: "'m", LEMMA: "be", NORM: "am", TAG: "VBP"},
ORTH: "'m",
LEMMA: "be",
NORM: "am",
TAG: "VBP",
"tenspect": 1,
"number": 1,
},
] ]
_exc[orth + "m"] = [ _exc[orth + "m"] = [

View File

@ -114,9 +114,9 @@ class FrenchLemmatizer(object):
def punct(self, string, morphology=None):
    """Lemmatize `string` with the part-of-speech fixed to "punct".

    string (unicode): The form to lemmatize.
    morphology: Optional morphological features, passed through unchanged.
    RETURNS: Whatever calling the lemmatizer instance itself returns for
        `(string, "punct", morphology)`.
    """
    return self(string, "punct", morphology)
def lookup(self, string, orth=None):
    """Return the first lookup-table entry for a token, or the input unchanged.

    string (unicode): Surface form. Used as the table key when `orth` is
        not given, and returned as-is when the key has no entry.
    orth: Optional alternative key. When it is not None it is used instead
        of `string` to index `self.lookup_table`.
    RETURNS: Element 0 of `self.lookup_table[key]` if the key is present,
        else `string`.
    """
    # Fall back to the string as the key so that single-argument calls still
    # consult the table instead of always returning the input.
    key = orth if orth is not None else string
    if key in self.lookup_table:
        # Only the first element of the stored entry is returned.
        return self.lookup_table[key][0]
    return string

View File

@ -37,6 +37,11 @@ def resolve_pos(token):
in the sentence. This function adds information to the POS tag to in the sentence. This function adds information to the POS tag to
resolve ambiguous mappings. resolve ambiguous mappings.
""" """
# this is only used for consecutive ascii spaces
if token.pos == "空白":
return "空白"
# TODO: This is a first take. The rules here are crude approximations. # TODO: This is a first take. The rules here are crude approximations.
# For many of these, full dependencies are needed to properly resolve # For many of these, full dependencies are needed to properly resolve
# PoS mappings. # PoS mappings.
@ -54,6 +59,7 @@ def detailed_tokens(tokenizer, text):
node = tokenizer.parseToNode(text) node = tokenizer.parseToNode(text)
node = node.next # first node is beginning of sentence and empty, skip it node = node.next # first node is beginning of sentence and empty, skip it
words = [] words = []
spaces = []
while node.posid != 0: while node.posid != 0:
surface = node.surface surface = node.surface
base = surface # a default value. Updated if available later. base = surface # a default value. Updated if available later.
@ -64,8 +70,20 @@ def detailed_tokens(tokenizer, text):
# dictionary # dictionary
base = parts[7] base = parts[7]
words.append(ShortUnitWord(surface, base, pos)) words.append(ShortUnitWord(surface, base, pos))
# The way MeCab stores spaces is that the rlength of the next token is
# the length of that token plus any preceding whitespace, **in bytes**.
# also note that this is only for half-width / ascii spaces. Full width
# spaces just become tokens.
scount = node.next.rlength - node.next.length
spaces.append(bool(scount))
while scount > 1:
words.append(ShortUnitWord(" ", " ", "空白"))
spaces.append(False)
scount -= 1
node = node.next node = node.next
return words return words, spaces
class JapaneseTokenizer(DummyTokenizer): class JapaneseTokenizer(DummyTokenizer):
@ -75,9 +93,8 @@ class JapaneseTokenizer(DummyTokenizer):
self.tokenizer.parseToNode("") # see #2901 self.tokenizer.parseToNode("") # see #2901
def __call__(self, text): def __call__(self, text):
dtokens = detailed_tokens(self.tokenizer, text) dtokens, spaces = detailed_tokens(self.tokenizer, text)
words = [x.surface for x in dtokens] words = [x.surface for x in dtokens]
spaces = [False] * len(words)
doc = Doc(self.vocab, words=words, spaces=spaces) doc = Doc(self.vocab, words=words, spaces=spaces)
mecab_tags = [] mecab_tags = []
for token, dtoken in zip(doc, dtokens): for token, dtoken in zip(doc, dtokens):

View File

@ -2,7 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
TAG_MAP = { TAG_MAP = {
@ -21,6 +21,8 @@ TAG_MAP = {
"感動詞,一般,*,*": {POS: INTJ}, "感動詞,一般,*,*": {POS: INTJ},
# this is specifically for unicode full-width space # this is specifically for unicode full-width space
"空白,*,*,*": {POS: X}, "空白,*,*,*": {POS: X},
# This is used when sequential half-width spaces are present
"空白": {POS: SPACE},
"形状詞,一般,*,*": {POS: ADJ}, "形状詞,一般,*,*": {POS: ADJ},
"形状詞,タリ,*,*": {POS: ADJ}, "形状詞,タリ,*,*": {POS: ADJ},
"形状詞,助動詞語幹,*,*": {POS: ADJ}, "形状詞,助動詞語幹,*,*": {POS: ADJ},

View File

@ -1605,7 +1605,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1613,7 +1613,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1621,7 +1621,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1630,7 +1630,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1638,7 +1638,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1647,7 +1647,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1655,7 +1655,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1664,7 +1664,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1672,7 +1672,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1681,7 +1681,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1689,7 +1689,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1697,7 +1697,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1706,7 +1706,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1714,7 +1714,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1723,7 +1723,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1731,7 +1731,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1739,7 +1739,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1748,7 +1748,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Imp", "Mood": "Imp",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1756,21 +1756,21 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
"Vgm-3---n--ns-": { "Vgm-3---n--ns-": {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
"Vgm-3---n--ys-": { "Vgm-3---n--ys-": {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1778,14 +1778,14 @@ TAG_MAP = {
"Vgm-3---y--ns-": { "Vgm-3---y--ns-": {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
"Vgm-3---y--ys-": { "Vgm-3---y--ys-": {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1794,7 +1794,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1802,7 +1802,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1811,7 +1811,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1819,7 +1819,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1827,7 +1827,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1836,7 +1836,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -1844,7 +1844,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Cnd", "Mood": "Cnd",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1853,7 +1853,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1862,7 +1862,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -1872,7 +1872,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1881,7 +1881,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -1891,7 +1891,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1900,7 +1900,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -1910,7 +1910,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1919,7 +1919,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -1929,7 +1929,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1938,7 +1938,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -1948,7 +1948,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1957,7 +1957,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1966,7 +1966,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1974,7 +1974,7 @@ TAG_MAP = {
"Vgma3---n--ni-": { "Vgma3---n--ni-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1982,7 +1982,7 @@ TAG_MAP = {
"Vgma3---n--yi-": { "Vgma3---n--yi-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -1991,7 +1991,7 @@ TAG_MAP = {
"Vgma3---y--ni-": { "Vgma3---y--ni-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -1999,7 +1999,7 @@ TAG_MAP = {
"Vgma3--y--ni-": { "Vgma3--y--ni-": {
POS: VERB, POS: VERB,
"Case": "Nom", "Case": "Nom",
"Person": "3", "Person": "three",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
}, },
@ -2007,7 +2007,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2016,7 +2016,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2026,7 +2026,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2035,7 +2035,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2045,7 +2045,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2054,7 +2054,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2064,7 +2064,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2074,7 +2074,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2083,7 +2083,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2093,7 +2093,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2102,7 +2102,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2112,7 +2112,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2121,7 +2121,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2130,7 +2130,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2140,7 +2140,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2149,7 +2149,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2158,7 +2158,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2168,7 +2168,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2177,7 +2177,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2187,7 +2187,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2196,7 +2196,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2205,7 +2205,7 @@ TAG_MAP = {
"Vgmf3---n--ni-": { "Vgmf3---n--ni-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2213,7 +2213,7 @@ TAG_MAP = {
"Vgmf3---y--ni-": { "Vgmf3---y--ni-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2222,7 +2222,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2231,7 +2231,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2241,7 +2241,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2250,7 +2250,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2259,7 +2259,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2269,7 +2269,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Fut", "Tense": "Fut",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2278,7 +2278,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Fut", "Tense": "Fut",
@ -2288,7 +2288,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2297,7 +2297,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2307,7 +2307,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2316,7 +2316,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2326,7 +2326,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2335,7 +2335,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2344,7 +2344,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2354,7 +2354,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2363,7 +2363,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2373,7 +2373,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2382,7 +2382,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2392,7 +2392,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2401,7 +2401,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2411,7 +2411,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2420,7 +2420,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2430,7 +2430,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2438,7 +2438,7 @@ TAG_MAP = {
"Vgmp3---n--ni-": { "Vgmp3---n--ni-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2446,7 +2446,7 @@ TAG_MAP = {
"Vgmp3---n--yi-": { "Vgmp3---n--yi-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2455,7 +2455,7 @@ TAG_MAP = {
"Vgmp3---y--ni-": { "Vgmp3---y--ni-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2463,7 +2463,7 @@ TAG_MAP = {
"Vgmp3---y--yi-": { "Vgmp3---y--yi-": {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2473,7 +2473,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2482,7 +2482,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2492,7 +2492,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2501,7 +2501,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2511,7 +2511,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2520,7 +2520,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2529,7 +2529,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2538,7 +2538,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2548,7 +2548,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Pres", "Tense": "Pres",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2557,7 +2557,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Pres", "Tense": "Pres",
@ -2568,7 +2568,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2578,7 +2578,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2589,7 +2589,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "1", "Person": "one",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2599,7 +2599,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "2", "Person": "two",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2608,7 +2608,7 @@ TAG_MAP = {
POS: VERB, POS: VERB,
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2618,7 +2618,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2628,7 +2628,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Plur", "Number": "Plur",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2639,7 +2639,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2649,7 +2649,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Reflex": "Yes", "Reflex": "Yes",
"Tense": "Past", "Tense": "Past",
@ -2660,7 +2660,7 @@ TAG_MAP = {
"Aspect": "Hab", "Aspect": "Hab",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Neg", "Polarity": "Neg",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",
@ -2670,7 +2670,7 @@ TAG_MAP = {
"Aspect": "Perf", "Aspect": "Perf",
"Mood": "Ind", "Mood": "Ind",
"Number": "Sing", "Number": "Sing",
"Person": "3", "Person": "three",
"Polarity": "Pos", "Polarity": "Pos",
"Tense": "Past", "Tense": "Past",
"VerbForm": "Fin", "VerbForm": "Fin",

View File

@ -103,8 +103,11 @@ class DutchLemmatizer(object):
# Overrides parent method so that a lowercased version of the string is # Overrides parent method so that a lowercased version of the string is
# used to search the lookup table. This is necessary because our lookup # used to search the lookup table. This is necessary because our lookup
# table consists entirely of lowercase keys. # table consists entirely of lowercase keys.
def lookup(self, string): def lookup(self, string, orth=None):
string = string.lower() string = string.lower()
if orth is not None:
return self.lookup_table.get(orth, string)
else:
return self.lookup_table.get(string, string) return self.lookup_table.get(string, string)
def noun(self, string, morphology=None): def noun(self, string, morphology=None):

View File

@ -115,7 +115,7 @@ class RussianLemmatizer(Lemmatizer):
def pron(self, string, morphology=None): def pron(self, string, morphology=None):
return self(string, "pron", morphology) return self(string, "pron", morphology)
def lookup(self, string): def lookup(self, string, orth=None):
analyses = self._morph.parse(string) analyses = self._morph.parse(string)
if len(analyses) == 1: if len(analyses) == 1:
return analyses[0].normal_form return analyses[0].normal_form

View File

@ -112,7 +112,7 @@ class UkrainianLemmatizer(Lemmatizer):
def pron(self, string, morphology=None): def pron(self, string, morphology=None):
return self(string, "pron", morphology) return self(string, "pron", morphology)
def lookup(self, string): def lookup(self, string, orth=None):
analyses = self._morph.parse(string) analyses = self._morph.parse(string)
if len(analyses) == 1: if len(analyses) == 1:
return analyses[0].normal_form return analyses[0].normal_form

View File

@ -20,6 +20,7 @@ from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
from .pipeline import SimilarityHook, TextCategorizer, Sentencizer from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
from .pipeline import EntityRuler from .pipeline import EntityRuler
from .pipeline import Morphologizer
from .compat import izip, basestring_ from .compat import izip, basestring_
from .gold import GoldParse from .gold import GoldParse
from .scorer import Scorer from .scorer import Scorer
@ -38,6 +39,8 @@ from . import about
class BaseDefaults(object): class BaseDefaults(object):
@classmethod @classmethod
def create_lemmatizer(cls, nlp=None, lookups=None): def create_lemmatizer(cls, nlp=None, lookups=None):
if lookups is None:
lookups = cls.create_lookups(nlp=nlp)
rules, index, exc, lookup = util.get_lemma_tables(lookups) rules, index, exc, lookup = util.get_lemma_tables(lookups)
return Lemmatizer(index, exc, rules, lookup) return Lemmatizer(index, exc, rules, lookup)
@ -108,6 +111,8 @@ class BaseDefaults(object):
syntax_iterators = {} syntax_iterators = {}
resources = {} resources = {}
writing_system = {"direction": "ltr", "has_case": True, "has_letters": True} writing_system = {"direction": "ltr", "has_case": True, "has_letters": True}
single_orth_variants = []
paired_orth_variants = []
class Language(object): class Language(object):
@ -128,6 +133,7 @@ class Language(object):
"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp), "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
"tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg), "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
"tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
"morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
"parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
"ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
"entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
@ -251,7 +257,8 @@ class Language(object):
@property @property
def pipe_labels(self): def pipe_labels(self):
"""Get the labels set by the pipeline components, if available. """Get the labels set by the pipeline components, if available (if
the component exposes a labels property).
RETURNS (dict): Labels keyed by component name. RETURNS (dict): Labels keyed by component name.
""" """
@ -583,6 +590,7 @@ class Language(object):
# Populate vocab # Populate vocab
else: else:
for _, annots_brackets in get_gold_tuples(): for _, annots_brackets in get_gold_tuples():
_ = annots_brackets.pop()
for annots, _ in annots_brackets: for annots, _ in annots_brackets:
for word in annots[1]: for word in annots[1]:
_ = self.vocab[word] # noqa: F841 _ = self.vocab[word] # noqa: F841
@ -651,7 +659,7 @@ class Language(object):
DOCS: https://spacy.io/api/language#evaluate DOCS: https://spacy.io/api/language#evaluate
""" """
if scorer is None: if scorer is None:
scorer = Scorer() scorer = Scorer(pipeline=self.pipeline)
if component_cfg is None: if component_cfg is None:
component_cfg = {} component_cfg = {}
docs, golds = zip(*docs_golds) docs, golds = zip(*docs_golds)

View File

@ -2,8 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from collections import OrderedDict from collections import OrderedDict
from .symbols import POS, NOUN, VERB, ADJ, PUNCT, PROPN from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos
class Lemmatizer(object): class Lemmatizer(object):
@ -55,12 +54,8 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely. avoid lemmatization entirely.
""" """
morphology = {} if morphology is None else morphology if morphology is None:
others = [ morphology = {}
key
for key in morphology
if key not in (POS, "Number", "POS", "VerbForm", "Tense")
]
if univ_pos == "noun" and morphology.get("Number") == "sing": if univ_pos == "noun" and morphology.get("Number") == "sing":
return True return True
elif univ_pos == "verb" and morphology.get("VerbForm") == "inf": elif univ_pos == "verb" and morphology.get("VerbForm") == "inf":
@ -71,18 +66,17 @@ class Lemmatizer(object):
morphology.get("VerbForm") == "fin" morphology.get("VerbForm") == "fin"
and morphology.get("Tense") == "pres" and morphology.get("Tense") == "pres"
and morphology.get("Number") is None and morphology.get("Number") is None
and not others
): ):
return True return True
elif univ_pos == "adj" and morphology.get("Degree") == "pos": elif univ_pos == "adj" and morphology.get("Degree") == "pos":
return True return True
elif VerbForm_inf in morphology: elif morphology.get("VerbForm") == "inf":
return True return True
elif VerbForm_none in morphology: elif morphology.get("VerbForm") == "none":
return True return True
elif Number_sing in morphology: elif morphology.get("VerbForm") == "inf":
return True return True
elif Degree_pos in morphology: elif morphology.get("Degree") == "pos":
return True return True
else: else:
return False return False
@ -99,9 +93,19 @@ class Lemmatizer(object):
def punct(self, string, morphology=None): def punct(self, string, morphology=None):
return self(string, "punct", morphology) return self(string, "punct", morphology)
def lookup(self, string): def lookup(self, string, orth=None):
if string in self.lookup_table: """Look up a lemma in the table, if available. If no lemma is found,
return self.lookup_table[string] the original string is returned.
string (unicode): The original string.
orth (int): Optional hash of the string to look up. If not set, the
string will be used and hashed.
RETURNS (unicode): The lemma if the string was found, otherwise the
original string.
"""
key = orth if orth is not None else string
if key in self.lookup_table:
return self.lookup_table[key]
return string return string

View File

@ -1,11 +1,13 @@
# coding: utf8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
import srsly import srsly
from collections import OrderedDict from collections import OrderedDict
from preshed.bloom import BloomFilter
from .errors import Errors from .errors import Errors
from .util import SimpleFrozenDict, ensure_path from .util import SimpleFrozenDict, ensure_path
from .strings import get_string_id
class Lookups(object): class Lookups(object):
@ -14,16 +16,14 @@ class Lookups(object):
so they can be accessed before the pipeline components are applied (e.g. so they can be accessed before the pipeline components are applied (e.g.
in the tokenizer and lemmatizer), as well as within the pipeline components in the tokenizer and lemmatizer), as well as within the pipeline components
via doc.vocab.lookups. via doc.vocab.lookups.
Important note: At the moment, this class only performs a very basic
dictionary lookup. We're planning to replace this with a more efficient
implementation. See #3971 for details.
""" """
def __init__(self): def __init__(self):
"""Initialize the Lookups object. """Initialize the Lookups object.
RETURNS (Lookups): The newly created object. RETURNS (Lookups): The newly created object.
DOCS: https://spacy.io/api/lookups#init
""" """
self._tables = OrderedDict() self._tables = OrderedDict()
@ -32,7 +32,7 @@ class Lookups(object):
Lookups.has_table. Lookups.has_table.
name (unicode): Name of the table. name (unicode): Name of the table.
RETURNS (bool): Whether a table of that name exists. RETURNS (bool): Whether a table of that name is in the lookups.
""" """
return self.has_table(name) return self.has_table(name)
@ -51,11 +51,12 @@ class Lookups(object):
name (unicode): Unique name of table. name (unicode): Unique name of table.
data (dict): Optional data to add to the table. data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table. RETURNS (Table): The newly added table.
DOCS: https://spacy.io/api/lookups#add_table
""" """
if name in self.tables: if name in self.tables:
raise ValueError(Errors.E158.format(name=name)) raise ValueError(Errors.E158.format(name=name))
table = Table(name=name) table = Table(name=name, data=data)
table.update(data)
self._tables[name] = table self._tables[name] = table
return table return table
@ -64,6 +65,8 @@ class Lookups(object):
name (unicode): Name of the table. name (unicode): Name of the table.
RETURNS (Table): The table. RETURNS (Table): The table.
DOCS: https://spacy.io/api/lookups#get_table
""" """
if name not in self._tables: if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables)) raise KeyError(Errors.E159.format(name=name, tables=self.tables))
@ -72,8 +75,10 @@ class Lookups(object):
def remove_table(self, name): def remove_table(self, name):
"""Remove a table. Raises an error if the table doesn't exist. """Remove a table. Raises an error if the table doesn't exist.
name (unicode): The name to remove. name (unicode): Name of the table to remove.
RETURNS (Table): The removed table. RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
""" """
if name not in self._tables: if name not in self._tables:
raise KeyError(Errors.E159.format(name=name, tables=self.tables)) raise KeyError(Errors.E159.format(name=name, tables=self.tables))
@ -84,45 +89,57 @@ class Lookups(object):
name (unicode): Name of the table. name (unicode): Name of the table.
RETURNS (bool): Whether a table of that name exists. RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
""" """
return name in self._tables return name in self._tables
def to_bytes(self, exclude=tuple(), **kwargs): def to_bytes(self, **kwargs):
"""Serialize the lookups to a bytestring. """Serialize the lookups to a bytestring.
exclude (list): String names of serialization fields to exclude.
RETURNS (bytes): The serialized Lookups. RETURNS (bytes): The serialized Lookups.
DOCS: https://spacy.io/api/lookups#to_bytes
""" """
return srsly.msgpack_dumps(self._tables) return srsly.msgpack_dumps(self._tables)
def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): def from_bytes(self, bytes_data, **kwargs):
"""Load the lookups from a bytestring. """Load the lookups from a bytestring.
exclude (list): String names of serialization fields to exclude. bytes_data (bytes): The data to load.
RETURNS (bytes): The loaded Lookups. RETURNS (Lookups): The loaded Lookups.
DOCS: https://spacy.io/api/lookups#from_bytes
""" """
self._tables = OrderedDict() for key, value in srsly.msgpack_loads(bytes_data).items():
msg = srsly.msgpack_loads(bytes_data) self._tables[key] = Table(key)
for key, value in msg.items(): self._tables[key].update(value)
self._tables[key] = Table.from_dict(value)
return self return self
def to_disk(self, path, **kwargs): def to_disk(self, path, **kwargs):
"""Save the lookups to a directory as lookups.bin. """Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
path (unicode / Path): The file path. path (unicode / Path): The file path.
DOCS: https://spacy.io/api/lookups#to_disk
""" """
if len(self._tables): if len(self._tables):
path = ensure_path(path) path = ensure_path(path)
if not path.exists():
path.mkdir()
filepath = path / "lookups.bin" filepath = path / "lookups.bin"
with filepath.open("wb") as file_: with filepath.open("wb") as file_:
file_.write(self.to_bytes()) file_.write(self.to_bytes())
def from_disk(self, path, **kwargs): def from_disk(self, path, **kwargs):
"""Load lookups from a directory containing a lookups.bin. """Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
path (unicode / Path): The file path. path (unicode / Path): The directory path.
RETURNS (Lookups): The loaded lookups. RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
""" """
path = ensure_path(path) path = ensure_path(path)
filepath = path / "lookups.bin" filepath = path / "lookups.bin"
@ -136,22 +153,118 @@ class Lookups(object):
class Table(OrderedDict): class Table(OrderedDict):
"""A table in the lookups. Subclass of builtin dict that implements a """A table in the lookups. Subclass of builtin dict that implements a
slightly more consistent and unified API. slightly more consistent and unified API.
Includes a Bloom filter to speed up missed lookups.
""" """
@classmethod @classmethod
def from_dict(cls, data, name=None): def from_dict(cls, data, name=None):
"""Initialize a new table from a dict.
data (dict): The dictionary.
name (unicode): Optional table name for reference.
RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.from_dict
"""
self = cls(name=name) self = cls(name=name)
self.update(data) self.update(data)
return self return self
def __init__(self, name=None): def __init__(self, name=None, data=None):
"""Initialize a new table. """Initialize a new table.
name (unicode): Optional table name for reference. name (unicode): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter.
RETURNS (Table): The newly created object. RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.init
""" """
OrderedDict.__init__(self) OrderedDict.__init__(self)
self.name = name self.name = name
# Assume a default size of 1M items
self.default_size = 1e6
size = len(data) if data and len(data) > 0 else self.default_size
self.bloom = BloomFilter.from_error_rate(size)
if data:
self.update(data)
def __setitem__(self, key, value):
"""Set new key/value pair. String keys will be hashed.
key (unicode / int): The key to set.
value: The value to set.
"""
key = get_string_id(key)
OrderedDict.__setitem__(self, key, value)
self.bloom.add(key)
def set(self, key, value): def set(self, key, value):
"""Set new key/value pair. Same as table[key] = value.""" """Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
key (unicode / int): The key to set.
value: The value to set.
"""
self[key] = value self[key] = value
def __getitem__(self, key):
"""Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get.
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.__getitem__(self, key)
def get(self, key, default=None):
"""Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get.
default: The default value to return.
RETURNS: The value.
"""
key = get_string_id(key)
return OrderedDict.get(self, key, default)
def __contains__(self, key):
"""Check whether a key is in the table. String keys will be hashed.
key (unicode / int): The key to check.
RETURNS (bool): Whether the key is in the table.
"""
key = get_string_id(key)
# This can give a false positive, so we need to check it after
if key not in self.bloom:
return False
return OrderedDict.__contains__(self, key)
def to_bytes(self):
"""Serialize table to a bytestring.
RETURNS (bytes): The serialized table.
DOCS: https://spacy.io/api/lookups#table.to_bytes
"""
data = [
("name", self.name),
("dict", dict(self.items())),
("bloom", self.bloom.to_bytes()),
]
return srsly.msgpack_dumps(OrderedDict(data))
def from_bytes(self, bytes_data):
"""Load a table from a bytestring.
bytes_data (bytes): The data to load.
RETURNS (Table): The loaded table.
DOCS: https://spacy.io/api/lookups#table.from_bytes
"""
loaded = srsly.msgpack_loads(bytes_data)
data = loaded.get("dict", {})
self.name = loaded["name"]
self.bloom = BloomFilter().from_bytes(loaded["bloom"])
self.clear()
self.update(data)
return self

View File

@ -1,301 +1,41 @@
from cymem.cymem cimport Pool from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray from preshed.maps cimport PreshMap, PreshMapArray
from libc.stdint cimport uint64_t from libc.stdint cimport uint64_t
from murmurhash cimport mrmr
from .structs cimport TokenC from .structs cimport TokenC, MorphAnalysisC
from .strings cimport StringStore from .strings cimport StringStore
from .typedefs cimport attr_t, flags_t from .typedefs cimport hash_t, attr_t, flags_t
from .parts_of_speech cimport univ_pos_t from .parts_of_speech cimport univ_pos_t
from . cimport symbols from . cimport symbols
cdef struct RichTagC:
uint64_t morph
int id
univ_pos_t pos
attr_t name
cdef struct MorphAnalysisC:
RichTagC tag
attr_t lemma
cdef class Morphology: cdef class Morphology:
cdef readonly Pool mem cdef readonly Pool mem
cdef readonly StringStore strings cdef readonly StringStore strings
cdef PreshMap tags # Keyed by hash, value is pointer to tag
cdef public object lemmatizer cdef public object lemmatizer
cdef readonly object tag_map cdef readonly object tag_map
cdef public object n_tags cdef readonly object tag_names
cdef public object reverse_index cdef readonly object reverse_index
cdef public object tag_names cdef readonly object exc
cdef public object exc cdef readonly object _feat_map
cdef readonly PreshMapArray _cache
cdef readonly int n_tags
cdef RichTagC* rich_tags cpdef update(self, hash_t morph, features)
cdef PreshMapArray _cache cdef hash_t insert(self, MorphAnalysisC tag) except 0
cdef int assign_untagged(self, TokenC* token) except -1 cdef int assign_untagged(self, TokenC* token) except -1
cdef int assign_tag(self, TokenC* token, tag) except -1 cdef int assign_tag(self, TokenC* token, tag) except -1
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1 cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
cdef enum univ_morph_t: cdef int check_feature(const MorphAnalysisC* tag, attr_t feature) nogil
NIL = 0 cdef attr_t get_field(const MorphAnalysisC* tag, int field) nogil
Animacy_anim = symbols.Animacy_anim cdef list list_features(const MorphAnalysisC* tag)
Animacy_inan
Animacy_hum
Animacy_nhum
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
Case_abe
Case_abl
Case_abs
Case_acc
Case_ade
Case_all
Case_cau
Case_com
Case_dat
Case_del
Case_dis
Case_ela
Case_ess
Case_gen
Case_ill
Case_ine
Case_ins
Case_loc
Case_lat
Case_nom
Case_par
Case_sub
Case_sup
Case_tem
Case_ter
Case_tra
Case_voc
Definite_two
Definite_def
Definite_red
Definite_cons # U20
Definite_ind
Degree_cmp
Degree_comp
Degree_none
Degree_pos
Degree_sup
Degree_abs
Degree_com
Degree_dim # du
Gender_com
Gender_fem
Gender_masc
Gender_neut
Mood_cnd
Mood_imp
Mood_ind
Mood_n
Mood_pot
Mood_sub
Mood_opt
Negative_neg
Negative_pos
Negative_yes
Polarity_neg # U20
Polarity_pos # U20
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
NumType_card
NumType_dist
NumType_frac
NumType_gen
NumType_mult
NumType_none
NumType_ord
NumType_sets
Person_one
Person_two
Person_three
Person_none
Poss_yes
PronType_advPart
PronType_art
PronType_default
PronType_dem
PronType_ind
PronType_int
PronType_neg
PronType_prs
PronType_rcp
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
Reflex_yes
Tense_fut
Tense_imp
Tense_past
Tense_pres
VerbForm_fin
VerbForm_ger
VerbForm_inf
VerbForm_none
VerbForm_part
VerbForm_partFut
VerbForm_partPast
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_int # hb
Abbr_yes # cz, fi, sl, U
AdpType_prep # cz, U
AdpType_post # U
AdpType_voc # cz
AdpType_comprep # cz
AdpType_circ # U
AdvType_man
AdvType_loc
AdvType_tim
AdvType_deg
AdvType_cau
AdvType_mod
AdvType_sta
AdvType_ex
AdvType_adadj
ConjType_oper # cz, U
ConjType_comp # cz, U
Connegative_yes # fi
Derivation_minen # fi
Derivation_sti # fi
Derivation_inen # fi
Derivation_lainen # fi
Derivation_ja # fi
Derivation_ton # fi
Derivation_vs # fi
Derivation_ttain # fi
Derivation_ttaa # fi
Echo_rdp # U
Echo_ech # U
Foreign_foreign # cz, fi, U
Foreign_fscript # cz, fi, U
Foreign_tscript # cz, U
Foreign_yes # sl
Gender_dat_masc # bq, U
Gender_dat_fem # bq, U
Gender_erg_masc # bq
Gender_erg_fem # bq
Gender_psor_masc # cz, sl, U
Gender_psor_fem # cz, sl, U
Gender_psor_neut # sl
Hyph_yes # cz, U
InfForm_one # fi
InfForm_two # fi
InfForm_three # fi
NameType_geo # U, cz
NameType_prs # U, cz
NameType_giv # U, cz
NameType_sur # U, cz
NameType_nat # U, cz
NameType_com # U, cz
NameType_pro # U, cz
NameType_oth # U, cz
NounType_com # U
NounType_prop # U
NounType_class # U
Number_abs_sing # bq, U
Number_abs_plur # bq, U
Number_dat_sing # bq, U
Number_dat_plur # bq, U
Number_erg_sing # bq, U
Number_erg_plur # bq, U
Number_psee_sing # U
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
NumValue_one # cz, U
NumValue_two # cz, U
NumValue_three # cz, U
PartForm_pres # fi
PartForm_past # fi
PartForm_agt # fi
PartForm_neg # fi
PartType_mod # U
PartType_emp # U
PartType_res # U
PartType_inf # U
PartType_vbp # U
Person_abs_one # bq, U
Person_abs_two # bq, U
Person_abs_three # bq, U
Person_dat_one # bq, U
Person_dat_two # bq, U
Person_dat_three # bq, U
Person_erg_one # bq, U
Person_erg_two # bq, U
Person_erg_three # bq, U
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
Polite_abs_pol # bq, U
Polite_erg_inf # bq, U
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
Prefix_yes # U
PrepCase_npr # cz
PrepCase_pre # U
PunctSide_ini # U
PunctSide_fin # U
PunctType_peri # U
PunctType_qest # U
PunctType_excl # U
PunctType_quot # U
PunctType_brck # U
PunctType_comm # U
PunctType_colo # U
PunctType_semi # U
PunctType_dash # U
Style_arch # cz, fi, U
Style_rare # cz, fi, U
Style_poet # cz, U
Style_norm # cz, U
Style_coll # cz, U
Style_vrnc # cz, U
Style_sing # cz, U
Style_expr # cz, U
Style_derg # cz, U
Style_vulg # cz, U
Style_yes # fi, U
StyleVariant_styleShort # cz
StyleVariant_styleBound # cz, sl
VerbType_aux # U
VerbType_cop # U
VerbType_mod # U
VerbType_light # U
cdef tag_to_json(const MorphAnalysisC* tag)

File diff suppressed because it is too large Load Diff

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker from .pipes import Tagger, DependencyParser, EntityRecognizer, EntityLinker
from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
from .morphologizer import Morphologizer
from .entityruler import EntityRuler from .entityruler import EntityRuler
from .hooks import SentenceSegmenter, SimilarityHook from .hooks import SentenceSegmenter, SimilarityHook
from .functions import merge_entities, merge_noun_chunks, merge_subtokens from .functions import merge_entities, merge_noun_chunks, merge_subtokens
@ -15,6 +16,7 @@ __all__ = [
"TextCategorizer", "TextCategorizer",
"Tensorizer", "Tensorizer",
"Pipe", "Pipe",
"Morphologizer",
"EntityRuler", "EntityRuler",
"Sentencizer", "Sentencizer",
"SentenceSegmenter", "SentenceSegmenter",

View File

@ -0,0 +1,164 @@
from __future__ import unicode_literals
from collections import OrderedDict, defaultdict
import numpy
cimport numpy as np
from thinc.api import chain
from thinc.neural.util import to_categorical, copy_array, get_array_module
from .. import util
from .pipes import Pipe
from .._ml import Tok2Vec, build_morphologizer_model
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import create_default_optimizer
from ..errors import Errors, TempErrors
from ..compat import basestring_
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
class Morphologizer(Pipe):
    """Pipeline component that predicts morphological features per token.

    The model produces one block of scores per field of the morphology
    class map. For every token the best-scoring entry of each block is
    mapped back to a feature, and the resulting feature set is stored as
    the token's morphological analysis.
    """
    name = 'morphologizer'

    @classmethod
    def Model(cls, **cfg):
        """Build the model, sized by the field sizes of the class map.

        RAISES ValueError: if `pretrained_dims` is set without
            `pretrained_vectors`.
        """
        if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
            raise ValueError(TempErrors.T008)
        class_map = Morphology.create_class_map()
        return build_morphologizer_model(class_map.field_sizes, **cfg)

    def __init__(self, vocab, model=True, **cfg):
        """Store the vocab, model and (sorted) config, and build the class map.

        vocab: The shared vocabulary; its morphology provides the class map.
        model: A model instance, or a placeholder (True/False/None).
        **cfg: Extra settings; `cnn_maxout_pieces` defaults to 2.
        """
        self.vocab = vocab
        self.model = model
        self.cfg = OrderedDict(sorted(cfg.items()))
        self.cfg.setdefault('cnn_maxout_pieces', 2)
        self._class_map = self.vocab.morphology.create_class_map()

    @property
    def labels(self):
        """RETURNS: The tag names known to the vocab's morphology."""
        return self.vocab.morphology.tag_names

    @property
    def tok2vec(self):
        """RETURNS: The model's tok2vec layer chained with `flatten`, or None
        while the model is still a placeholder (None/True/False)."""
        if self.model in (None, True, False):
            return None
        else:
            return chain(self.model.tok2vec, flatten)

    def __call__(self, doc):
        """Annotate a single doc in place and return it."""
        features, tokvecs = self.predict([doc])
        self.set_annotations([doc], features, tensors=tokvecs)
        return doc

    def pipe(self, stream, batch_size=128, n_threads=-1):
        """Annotate a stream of docs in minibatches, yielding them in order.

        n_threads is accepted but not used by this implementation.
        """
        for docs in util.minibatch(stream, size=batch_size):
            docs = list(docs)
            features, tokvecs = self.predict(docs)
            self.set_annotations(docs, features, tensors=tokvecs)
            yield from docs

    def predict(self, docs):
        """Run the model over a batch of docs.

        RETURNS (tuple): `(scores, tokvecs)` as produced by the model's
            softmax and tok2vec layers. If no doc contains any token,
            zero-row arrays are allocated instead of calling the model.
        """
        if not any(len(doc) for doc in docs):
            # Handle case where there are no tokens in any docs.
            n_labels = self.model.nO
            guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs]
            tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO))
            return guesses, tokvecs
        tokvecs = self.model.tok2vec(docs)
        scores = self.model.softmax(tokvecs)
        return scores, tokvecs

    def set_annotations(self, docs, batch_scores, tensors=None):
        """Write the predicted analyses onto the tokens of each doc.

        docs: A single Doc or a sequence of Docs.
        batch_scores: One score array per doc, as returned by `predict`.
        tensors: Accepted for signature compatibility; not used here.
        """
        if isinstance(docs, Doc):
            docs = [docs]
        cdef Doc doc
        cdef Vocab vocab = self.vocab
        offsets = [self._class_map.get_field_offset(field)
                   for field in self._class_map.fields]
        out_sizes = self.model.softmax.out_sizes
        n_fields = len(self._class_map.fields)
        for i, doc in enumerate(docs):
            doc_scores = batch_scores[i]
            doc_guesses = scores_to_guesses(doc_scores, out_sizes)
            # Guesses computed on the GPU have to be copied to the host
            # before they are mixed into the numpy array below (the Tagger
            # applies the same `.get()` conversion to its tag IDs).
            if hasattr(doc_guesses, "get"):
                doc_guesses = doc_guesses.get()
            # Convert the neuron indices into feature IDs.
            doc_feat_ids = numpy.zeros((len(doc), n_fields), dtype='i')
            for j in range(len(doc)):
                for k, offset in enumerate(offsets):
                    # Index 0 within a field is kept as feature ID 0 rather
                    # than being shifted by the field offset.
                    if doc_guesses[j, k] == 0:
                        doc_feat_ids[j, k] = 0
                    else:
                        doc_feat_ids[j, k] = offset + doc_guesses[j, k]
                # Get the set of feature names.
                feats = {self._class_map.col2info[f][2] for f in doc_feat_ids[j]}
                if "NIL" in feats:
                    feats.remove("NIL")
                # Now add the analysis, and set the hash.
                doc.c[j].morph = vocab.morphology.add(feats)
                # Only overwrite the coarse POS when the analysis carries one.
                if doc[j].morph.pos != 0:
                    doc.c[j].pos = doc[j].morph.pos

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
        """Run one training step on a batch.

        losses: Optional dict; this component's loss is accumulated under
            its `name` key (initialised to 0. when missing).
        """
        if losses is not None and self.name not in losses:
            losses[self.name] = 0.

        tag_scores, bp_tag_scores = self.model.begin_update(docs, drop=drop)
        loss, d_tag_scores = self.get_loss(docs, golds, tag_scores)
        bp_tag_scores(d_tag_scores, sgd=sgd)

        if losses is not None:
            losses[self.name] += loss

    def get_loss(self, docs, golds, scores):
        """Compute the squared-error loss and its gradient for a batch.

        The target row of a token has a 1 in each field: at the gold
        feature's column when the gold analysis sets that field, otherwise
        at the field's first column. Tokens whose gold features are None
        use their own scores as target and so contribute no gradient.

        RETURNS (tuple): `(loss, d_scores)`, with `d_scores` split back into
            one array per doc.
        """
        scores = self.model.ops.xp.vstack(scores)
        if not isinstance(scores, numpy.ndarray):
            scores = scores.get()
        cdef int idx = 0
        # Do this on CPU, as we can't vectorize easily.
        target = numpy.zeros(scores.shape, dtype='f')
        for doc, gold in zip(docs, golds):
            for t, features in enumerate(gold.morphology):
                if features is None:
                    target[idx] = scores[idx]
                else:
                    gold_fields = {}
                    for feature in features:
                        field = self._class_map.feat2field[feature]
                        gold_fields[field] = self._class_map.feat2offset[feature]
                    for field in self._class_map.fields:
                        field_id = self._class_map.field2id[field]
                        col_offset = self._class_map.field2col[field]
                        if field_id in gold_fields:
                            target[idx, col_offset + gold_fields[field_id]] = 1.
                        else:
                            target[idx, col_offset] = 1.
                idx += 1
        target = self.model.ops.asarray(target, dtype='f')
        scores = self.model.ops.asarray(scores, dtype='f')
        d_scores = scores - target
        loss = (d_scores**2).sum()
        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
        return float(loss), d_scores

    def use_params(self, params):
        """Generator that yields once inside the model's own
        `use_params(params)` context."""
        with self.model.use_params(params):
            yield
def scores_to_guesses(scores, out_sizes):
    """Pick the best-scoring index inside each consecutive block of columns.

    scores: 2-d array with one row per item; its columns are read as
        consecutive blocks whose widths are given by `out_sizes`.
    out_sizes: Width of each block, in column order.
    RETURNS: Integer array with one row per item and one column per block,
        holding the argmax position relative to the start of the block.
    """
    xp = get_array_module(scores)
    n_rows = scores.shape[0]
    guesses = xp.zeros((n_rows, len(out_sizes)), dtype='i')
    start = 0
    for col, width in enumerate(out_sizes):
        end = start + width
        guesses[:, col] = scores[:, start:end].argmax(axis=1)
        start = end
    return guesses

View File

@ -424,18 +424,22 @@ class Tagger(Pipe):
cdef Doc doc cdef Doc doc
cdef int idx = 0 cdef int idx = 0
cdef Vocab vocab = self.vocab cdef Vocab vocab = self.vocab
assign_morphology = self.cfg.get("set_morphology", True)
for i, doc in enumerate(docs): for i, doc in enumerate(docs):
doc_tag_ids = batch_tag_ids[i] doc_tag_ids = batch_tag_ids[i]
if hasattr(doc_tag_ids, "get"): if hasattr(doc_tag_ids, "get"):
doc_tag_ids = doc_tag_ids.get() doc_tag_ids = doc_tag_ids.get()
for j, tag_id in enumerate(doc_tag_ids): for j, tag_id in enumerate(doc_tag_ids):
# Don't clobber preset POS tags # Don't clobber preset POS tags
if doc.c[j].tag == 0 and doc.c[j].pos == 0: if doc.c[j].tag == 0:
if doc.c[j].pos == 0 and assign_morphology:
# Don't clobber preset lemmas # Don't clobber preset lemmas
lemma = doc.c[j].lemma lemma = doc.c[j].lemma
vocab.morphology.assign_tag_id(&doc.c[j], tag_id) vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
if lemma != 0 and lemma != doc.c[j].lex.orth: if lemma != 0 and lemma != doc.c[j].lex.orth:
doc.c[j].lemma = lemma doc.c[j].lemma = lemma
else:
doc.c[j].tag = self.vocab.strings[self.labels[tag_id]]
idx += 1 idx += 1
if tensors is not None and len(tensors): if tensors is not None and len(tensors):
if isinstance(doc.tensor, numpy.ndarray) \ if isinstance(doc.tensor, numpy.ndarray) \
@ -500,6 +504,7 @@ class Tagger(Pipe):
orig_tag_map = dict(self.vocab.morphology.tag_map) orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict() new_tag_map = OrderedDict()
for raw_text, annots_brackets in get_gold_tuples(): for raw_text, annots_brackets in get_gold_tuples():
_ = annots_brackets.pop()
for annots, brackets in annots_brackets: for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots ids, words, tags, heads, deps, ents = annots
for tag in tags: for tag in tags:
@ -1012,6 +1017,10 @@ class TextCategorizer(Pipe):
return 1 return 1
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
for raw_text, annots_brackets in get_gold_tuples():
cats = annots_brackets.pop()
for cat in cats:
self.add_label(cat)
if self.model is True: if self.model is True:
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
self.require_labels() self.require_labels()

View File

@ -1,7 +1,10 @@
# coding: utf8 # coding: utf8
from __future__ import division, print_function, unicode_literals from __future__ import division, print_function, unicode_literals
import numpy as np
from .gold import tags_to_entities, GoldParse from .gold import tags_to_entities, GoldParse
from .errors import Errors
class PRFScore(object): class PRFScore(object):
@ -34,10 +37,39 @@ class PRFScore(object):
return 2 * ((p * r) / (p + r + 1e-100)) return 2 * ((p * r) / (p + r + 1e-100))
class ROCAUCScore(object):
    """Collects (candidate, gold) pairs and lazily computes a ROC AUC score.

    The score is cached and only recomputed when pairs have been added
    since the last computation.
    """

    def __init__(self):
        self.golds = []
        self.cands = []
        # Cached score and the number of pairs it was computed from.
        self.saved_score = 0.0
        self.saved_score_at_len = 0

    def score_set(self, cand, gold):
        """Record one candidate score together with its gold value."""
        self.cands.append(cand)
        self.golds.append(gold)

    @property
    def score(self):
        """RETURNS (float): The cached or freshly computed ROC AUC score;
        negative infinity when the computation raises ValueError."""
        n_seen = len(self.golds)
        if n_seen != self.saved_score_at_len:
            try:
                self.saved_score = _roc_auc_score(self.golds, self.cands)
            except ValueError:
                # Only one class present in y_true: the ROC AUC score is
                # not defined in that case.
                self.saved_score = -float("inf")
            self.saved_score_at_len = n_seen
        return self.saved_score
class Scorer(object): class Scorer(object):
"""Compute evaluation scores.""" """Compute evaluation scores."""
def __init__(self, eval_punct=False): def __init__(self, eval_punct=False, pipeline=None):
"""Initialize the Scorer. """Initialize the Scorer.
eval_punct (bool): Evaluate the dependency attachments to and from eval_punct (bool): Evaluate the dependency attachments to and from
@ -54,6 +86,24 @@ class Scorer(object):
self.ner = PRFScore() self.ner = PRFScore()
self.ner_per_ents = dict() self.ner_per_ents = dict()
self.eval_punct = eval_punct self.eval_punct = eval_punct
self.textcat = None
self.textcat_per_cat = dict()
self.textcat_positive_label = None
self.textcat_multilabel = False
if pipeline:
for name, model in pipeline:
if name == "textcat":
self.textcat_positive_label = model.cfg.get("positive_label", None)
if self.textcat_positive_label:
self.textcat = PRFScore()
if not model.cfg.get("exclusive_classes", False):
self.textcat_multilabel = True
for label in model.cfg.get("labels", []):
self.textcat_per_cat[label] = ROCAUCScore()
else:
for label in model.cfg.get("labels", []):
self.textcat_per_cat[label] = PRFScore()
@property @property
def tags_acc(self): def tags_acc(self):
@ -101,10 +151,47 @@ class Scorer(object):
for k, v in self.ner_per_ents.items() for k, v in self.ner_per_ents.items()
} }
@property
def textcat_score(self):
    """RETURNS (float): f-score on positive label for binary exclusive,
    macro-averaged f-score for 3+ exclusive,
    macro-averaged AUC ROC score for multilabel (-1 if undefined)
    """
    per_label = self.textcat_per_cat
    # The tiny epsilon keeps the average defined when there are no labels.
    n_labels = len(per_label) + 1e-100
    if self.textcat_multilabel:
        total = sum([scorer.score for scorer in per_label.values()])
        return max(total / n_labels, -1)
    if self.textcat_positive_label:
        # Exclusive classes with a designated positive label.
        return self.textcat.fscore * 100
    # Exclusive classes without a positive label: macro-average.
    total = sum([scorer.fscore for scorer in per_label.values()])
    return total / n_labels * 100
@property
def textcats_per_cat(self):
    """RETURNS (dict): Scores per textcat label.
    """
    per_label = {}
    if self.textcat_multilabel:
        # Multilabel: one ROC AUC score per label, floored at -1.
        for label, scorer in self.textcat_per_cat.items():
            per_label[label] = {"roc_auc_score": max(scorer.score, -1)}
    else:
        # Exclusive classes: precision / recall / f-score as percentages.
        for label, scorer in self.textcat_per_cat.items():
            per_label[label] = {
                "p": scorer.precision * 100,
                "r": scorer.recall * 100,
                "f": scorer.fscore * 100,
            }
    return per_label
@property @property
def scores(self): def scores(self):
"""RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`, """RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`,
`ents_r`, `ents_f`, `tags_acc` and `token_acc`. `ents_r`, `ents_f`, `tags_acc`, `token_acc`, and `textcat_score`.
""" """
return { return {
"uas": self.uas, "uas": self.uas,
@ -115,6 +202,8 @@ class Scorer(object):
"ents_per_type": self.ents_per_type, "ents_per_type": self.ents_per_type,
"tags_acc": self.tags_acc, "tags_acc": self.tags_acc,
"token_acc": self.token_acc, "token_acc": self.token_acc,
"textcat_score": self.textcat_score,
"textcats_per_cat": self.textcats_per_cat,
} }
def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")):
@ -192,9 +281,301 @@ class Scorer(object):
self.unlabelled.score_set( self.unlabelled.score_set(
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps) set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
) )
if (
len(gold.cats) > 0
and set(self.textcat_per_cat) == set(gold.cats)
and set(gold.cats) == set(doc.cats)
):
goldcat = max(gold.cats, key=gold.cats.get)
candcat = max(doc.cats, key=doc.cats.get)
if self.textcat_positive_label:
self.textcat.score_set(
set([self.textcat_positive_label]) & set([candcat]),
set([self.textcat_positive_label]) & set([goldcat]),
)
for label in self.textcat_per_cat:
if self.textcat_multilabel:
self.textcat_per_cat[label].score_set(
doc.cats[label], gold.cats[label]
)
else:
self.textcat_per_cat[label].score_set(
set([label]) & set([candcat]), set([label]) & set([goldcat])
)
elif len(self.textcat_per_cat) > 0:
model_labels = set(self.textcat_per_cat)
eval_labels = set(gold.cats)
raise ValueError(
Errors.E162.format(model_labels=model_labels, eval_labels=eval_labels)
)
if verbose: if verbose:
gold_words = [item[1] for item in gold.orig_annot] gold_words = [item[1] for item in gold.orig_annot]
for w_id, h_id, dep in cand_deps - gold_deps: for w_id, h_id, dep in cand_deps - gold_deps:
print("F", gold_words[w_id], dep, gold_words[h_id]) print("F", gold_words[w_id], dep, gold_words[h_id])
for w_id, h_id, dep in gold_deps - cand_deps: for w_id, h_id, dep in gold_deps - cand_deps:
print("M", gold_words[w_id], dep, gold_words[h_id]) print("M", gold_words[w_id], dep, gold_words[h_id])
#############################################################################
#
# The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the following license:
#
# New BSD License
#
# Copyright (c) 2007–2019 The scikit-learn developers.
# All rights reserved.
#
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# a. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# b. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# c. Neither the name of the Scikit-learn Developers nor the names of
# its contributors may be used to endorse or promote products
# derived from this software without specific prior written
# permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.
def _roc_auc_score(y_true, y_score):
    """Compute the area under the ROC curve from prediction scores.

    Restricted to binary classification.

    y_true: Gold labels; exactly two distinct values must be present.
    y_score: Prediction scores, aligned with `y_true`.
    RETURNS: The area under the ROC curve.
    RAISES ValueError: if `y_true` does not hold exactly two distinct values.

    References: Fawcett T., "An introduction to ROC analysis", Pattern
    Recognition Letters, 2006, 27(8):861-874;
    https://en.wikipedia.org/wiki/Receiver_operating_characteristic
    """
    distinct_labels = np.unique(y_true)
    if len(distinct_labels) != 2:
        raise ValueError(Errors.E165)
    false_pos_rate, true_pos_rate, _ = _roc_curve(y_true, y_score)
    return _auc(false_pos_rate, true_pos_rate)
def _roc_curve(y_true, y_score):
    """Compute the points of a ROC curve (binary classification only).

    y_true: Gold labels, as accepted by `_binary_clf_curve`.
    y_score: Prediction scores, aligned with `y_true`.
    RETURNS (tuple): `(fpr, tpr, thresholds)`. `fpr` and `tpr` are the
        cumulative false/true positive counts divided by their final
        value (all NaN when that value is not positive). `thresholds` are
        the score cut-offs, preceded by an extra `thresholds[0] + 1` entry
        so that the curve starts at (0, 0).
    """

    def _as_rate(counts):
        # Normalise cumulative counts by their total; undefined (NaN)
        # when there is nothing to normalise by.
        total = counts[-1]
        if total <= 0:
            return np.repeat(np.nan, counts.shape)
        return counts / total

    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
    # Prepend an extra position so the curve starts at (0, 0).
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[thresholds[0] + 1, thresholds]
    return _as_rate(fps), _as_rate(tps), thresholds
def _binary_clf_curve(y_true, y_score):
    """Count true and false positives at each distinct score threshold.

    y_true: Gold labels; entries equal to 1.0 count as positive.
    y_score: Prediction scores, aligned with `y_true`.
    RETURNS (tuple): `(fps, tps, thresholds)`: for each threshold (distinct
        score values in decreasing order), the number of negative and of
        positive samples whose score is >= that threshold.
    """
    positive_label = 1.0
    sample_weight = 1.0
    is_positive = np.ravel(y_true) == positive_label
    scores = np.ravel(y_score)

    # Order samples by decreasing score (stable sort, then reversed).
    order = np.argsort(scores, kind="mergesort")[::-1]
    scores = scores[order]
    is_positive = is_positive[order]

    # Scores often contain ties: keep only the last index of each run of
    # equal values, plus the final index to close the curve.
    change_points = np.where(np.diff(scores))[0]
    cut_idxs = np.r_[change_points, is_positive.size - 1]

    # Positives accumulated up to each cut; the rest of the samples seen
    # so far are false positives.
    tps = _stable_cumsum(is_positive * sample_weight)[cut_idxs]
    fps = 1 + cut_idxs - tps
    return fps, tps, scores[cut_idxs]
def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
"""Use high precision for cumsum and check that final value matches sum
Parameters
----------
arr : array-like
To be cumulatively summed as flat
axis : int, optional
Axis along which the cumulative sum is computed.
The default (None) is to compute the cumsum over the flattened array.
rtol : float
Relative tolerance, see ``np.allclose``
atol : float
Absolute tolerance, see ``np.allclose``
"""
out = np.cumsum(arr, axis=axis, dtype=np.float64)
expected = np.sum(arr, axis=axis, dtype=np.float64)
if not np.all(
np.isclose(
out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
)
):
raise ValueError(Errors.E163)
return out
def _auc(x, y):
"""Compute Area Under the Curve (AUC) using the trapezoidal rule
This is a general function, given points on a curve. For computing the
area under the ROC-curve, see :func:`roc_auc_score`.
Parameters
----------
x : array, shape = [n]
x coordinates. These must be either monotonic increasing or monotonic
decreasing.
y : array, shape = [n]
y coordinates.
Returns
-------
auc : float
"""
x = np.ravel(x)
y = np.ravel(y)
direction = 1
dx = np.diff(x)
if np.any(dx < 0):
if np.all(dx <= 0):
direction = -1
else:
raise ValueError(Errors.E164.format(x))
area = direction * np.trapz(y, x)
if isinstance(area, np.memmap):
# Reductions such as .sum used internally in np.trapz do not return a
# scalar by default for numpy.memmap instances contrary to
# regular numpy.ndarray instances.
area = area.dtype.type(area)
return area

View File

@ -119,9 +119,7 @@ cdef class StringStore:
return "" return ""
elif string_or_id in SYMBOLS_BY_STR: elif string_or_id in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string_or_id] return SYMBOLS_BY_STR[string_or_id]
cdef hash_t key cdef hash_t key
if isinstance(string_or_id, unicode): if isinstance(string_or_id, unicode):
key = hash_string(string_or_id) key = hash_string(string_or_id)
return key return key
@ -139,6 +137,20 @@ cdef class StringStore:
else: else:
return decode_Utf8Str(utf8str) return decode_Utf8Str(utf8str)
def as_int(self, key):
    """If key is an int, return it; otherwise, get the int value."""
    if isinstance(key, basestring):
        # Strings are resolved through the store's own lookup.
        return self[key]
    return key
def as_string(self, key):
    """If key is a string, return it; otherwise, get the string value."""
    if not isinstance(key, basestring):
        # Anything that is not a string is resolved through the store.
        return self[key]
    return key
def add(self, string): def add(self, string):
"""Add a string to the StringStore. """Add a string to the StringStore.

View File

@ -78,6 +78,54 @@ cdef struct TokenC:
hash_t ent_id hash_t ent_id
# C-level record of one morphological analysis: a coarse POS value, a
# length counter, and one attr_t slot per feature field.
# NOTE(review): slots presumably hold the string-store ID of the selected
# feature value, with 0 meaning "unset" — confirm against morphology.pyx.
# NOTE(review): `length` presumably counts the slots that are set — confirm.
cdef struct MorphAnalysisC:
    univ_pos_t pos
    int length

    # One slot per feature field.
    attr_t abbr
    attr_t adp_type
    attr_t adv_type
    attr_t animacy
    attr_t aspect
    attr_t case
    attr_t conj_type
    attr_t connegative
    attr_t definite
    attr_t degree
    attr_t derivation
    attr_t echo
    attr_t foreign
    attr_t gender
    attr_t hyph
    attr_t inf_form
    attr_t mood
    attr_t negative
    attr_t number
    attr_t name_type
    attr_t noun_type
    attr_t num_form
    attr_t num_type
    attr_t num_value
    attr_t part_form
    attr_t part_type
    attr_t person
    attr_t polite
    attr_t polarity
    attr_t poss
    attr_t prefix
    attr_t prep_case
    attr_t pron_type
    attr_t punct_side
    attr_t punct_type
    attr_t reflex
    attr_t style
    attr_t style_variant
    attr_t tense
    attr_t typo
    attr_t verb_form
    attr_t voice
    attr_t verb_type
# Internal struct, for storage and disambiguation of entities. # Internal struct, for storage and disambiguation of entities.
cdef struct KBEntryC: cdef struct KBEntryC:

View File

@ -342,6 +342,7 @@ cdef class ArcEager(TransitionSystem):
actions[RIGHT][label] = 1 actions[RIGHT][label] = 1
actions[REDUCE][label] = 1 actions[REDUCE][label] = 1
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
_ = sents.pop()
for (ids, words, tags, heads, labels, iob), ctnts in sents: for (ids, words, tags, heads, labels, iob), ctnts in sents:
heads, labels = nonproj.projectivize(heads, labels) heads, labels = nonproj.projectivize(heads, labels)
for child, head, label in zip(ids, heads, labels): for child, head, label in zip(ids, heads, labels):

View File

@ -73,6 +73,7 @@ cdef class BiluoPushDown(TransitionSystem):
actions[action][entity_type] = 1 actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U') moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []): for raw_text, sents in kwargs.get('gold_parses', []):
_ = sents.pop()
for (ids, words, tags, heads, labels, biluo), _ in sents: for (ids, words, tags, heads, labels, biluo), _ in sents:
for i, ner_tag in enumerate(biluo): for i, ner_tag in enumerate(biluo):
if ner_tag != 'O' and ner_tag != '-': if ner_tag != 'O' and ner_tag != '-':

View File

@ -587,6 +587,7 @@ cdef class Parser:
doc_sample = [] doc_sample = []
gold_sample = [] gold_sample = []
for raw_text, annots_brackets in islice(get_gold_tuples(), 1000): for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
_ = annots_brackets.pop()
for annots, brackets in annots_brackets: for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots ids, words, tags, heads, deps, ents = annots
doc_sample.append(Doc(self.vocab, words=words)) doc_sample.append(Doc(self.vocab, words=words))

View File

@ -5,11 +5,13 @@ import pytest
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.lemmatizer import Lemmatizer from spacy.lemmatizer import Lemmatizer
from spacy.lookups import Table
@pytest.fixture @pytest.fixture
def lemmatizer(): def lemmatizer():
return Lemmatizer(lookup={"dogs": "dog", "boxen": "box", "mice": "mouse"}) lookup = Table(data={"dogs": "dog", "boxen": "box", "mice": "mouse"})
return Lemmatizer(lookup=lookup)
@pytest.fixture @pytest.fixture

View File

@ -0,0 +1,33 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
@pytest.fixture
def i_has(en_tokenizer):
    """Doc for "I has" with the tags "PRP" and "VBZ" set on its two tokens."""
    tagged = en_tokenizer("I has")
    tagged[0].tag_ = "PRP"
    tagged[1].tag_ = "VBZ"
    return tagged
def test_token_morph_id(i_has):
    """Both tokens get a non-zero morph ID, and the two IDs differ."""
    first_id = i_has[0].morph.id
    assert first_id
    second_id = i_has[1].morph.id
    assert second_id != 0
    assert i_has[0].morph.id != i_has[1].morph.id
def test_morph_props(i_has):
    """`pron_type` is exposed as ID and as string, and is 0 when unset."""
    expected_id = i_has.vocab.strings["PronType_prs"]
    assert i_has[0].morph.pron_type == expected_id
    assert i_has[0].morph.pron_type_ == "PronType_prs"
    assert i_has[1].morph.pron_type == 0
def test_morph_iter(i_has):
    """Iterating over a token's morph yields its feature names."""
    pronoun_feats = list(i_has[0].morph)
    assert pronoun_feats == ["PronType_prs"]
    verb_feats = list(i_has[1].morph)
    assert verb_feats == ["Number_sing", "Person_three", "VerbForm_fin"]
def test_morph_get(i_has):
    """`morph.get` looks a feature value up by its field name."""
    pron_type = i_has[0].morph.get("pron_type")
    assert pron_type == "PronType_prs"

View File

@ -47,3 +47,10 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos): def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
pos = [token.pos_ for token in ja_tokenizer(text)] pos = [token.pos_ for token in ja_tokenizer(text)]
assert pos == expected_pos assert pos == expected_pos
def test_extra_spaces(ja_tokenizer):
    """Extra consecutive spaces are expected to come back as space tokens."""
    # note: three spaces after "I"
    # The literal must really contain three spaces: with a single space
    # there are no surplus spaces to turn into the two tokens checked below.
    tokens = ja_tokenizer("I   like cheese.")
    assert tokens[1].orth_ == " "
    assert tokens[2].orth_ == " "

View File

@ -17,4 +17,4 @@ TEST_CASES = [
@pytest.mark.parametrize("tokens,lemmas", TEST_CASES) @pytest.mark.parametrize("tokens,lemmas", TEST_CASES)
def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas): def test_lt_lemmatizer(lt_lemmatizer, tokens, lemmas):
assert lemmas == [lt_lemmatizer.lookup(token) for token in tokens] assert lemmas == [lt_lemmatizer.lookup_table.get(token, token) for token in tokens]

View File

View File

@ -0,0 +1,48 @@
# coding: utf-8
from __future__ import unicode_literals
import pytest
from spacy.morphology import Morphology
from spacy.strings import StringStore, get_string_id
from spacy.lemmatizer import Lemmatizer
@pytest.fixture
def morphology():
    """Morphology built from an empty StringStore, an empty tag map and a
    default Lemmatizer."""
    strings = StringStore()
    tag_map = {}
    return Morphology(strings, tag_map, Lemmatizer())
def test_init(morphology):
    """Passes as long as the `morphology` fixture can be constructed."""
def test_add_morphology_with_string_names(morphology):
    """Adding features given by name must not raise."""
    features = {"Case_gen", "Number_sing"}
    morphology.add(features)
def test_add_morphology_with_int_ids(morphology):
    """Adding features given by string ID must not raise."""
    feature_ids = {get_string_id("Case_gen"), get_string_id("Number_sing")}
    morphology.add(feature_ids)
def test_add_morphology_with_mix_strings_and_ints(morphology):
    """Adding a set that mixes a string ID and a name must not raise."""
    mixed = {get_string_id("PunctSide_ini"), "VerbType_aux"}
    morphology.add(mixed)
def test_morphology_tags_hash_distinctly(morphology):
    """Different feature sets must produce different tags."""
    punct_aux = morphology.add({"PunctSide_ini", "VerbType_aux"})
    gen_sing = morphology.add({"Case_gen", "Number_sing"})
    assert punct_aux != gen_sing
def test_morphology_tags_hash_independent_of_order(morphology):
    """The same features written in a different order give the same tag."""
    forward = morphology.add({"Case_gen", "Number_sing"})
    backward = morphology.add({"Number_sing", "Case_gen"})
    assert forward == backward
def test_update_morphology_tag(morphology):
    """Updating a tag with a feature gives a new tag, equal to the one
    obtained by adding both features at once."""
    genitive = morphology.add({"Case_gen"})
    updated = morphology.update(genitive, {"Number_sing"})
    assert genitive != updated
    combined = morphology.add({"Number_sing", "Case_gen"})
    assert updated == combined

View File

@ -82,6 +82,51 @@ def test_get_oracle_moves_negative_O(tsys, vocab):
assert names assert names
def test_oracle_moves_missing_B(en_vocab):
    """Smoke test: computing the oracle sequence must not raise when the
    only annotated entity tag is an L- tag preceded by missing values."""
    words = ["B", "52", "Bomber"]
    biluo_tags = [None, None, "L-PRODUCT"]

    doc = Doc(en_vocab, words=words)
    gold = GoldParse(doc, words=words, entities=biluo_tags)

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    for tag in biluo_tags:
        if tag is None:
            continue
        elif tag == "O":
            moves.add_action(move_types.index("O"), "")
        else:
            # Only the label is needed: every B/I/L/U action is registered
            # for it, whichever action the tag itself carries.
            _action, label = tag.split("-")
            moves.add_action(move_types.index("B"), label)
            moves.add_action(move_types.index("I"), label)
            moves.add_action(move_types.index("L"), label)
            moves.add_action(move_types.index("U"), label)
    moves.preprocess_gold(gold)
    moves.get_oracle_sequence(doc, gold)
def test_oracle_moves_whitespace(en_vocab):
    """Smoke test: computing the oracle sequence must not raise when
    newline tokens sit inside and around an entity, one of them without
    an entity annotation."""
    words = ["production", "\n", "of", "Northrop", "\n", "Corp.", "\n", "'s", "radar"]
    biluo_tags = ["O", "O", "O", "B-ORG", None, "I-ORG", "L-ORG", "O", "O"]

    doc = Doc(en_vocab, words=words)
    gold = GoldParse(doc, words=words, entities=biluo_tags)

    moves = BiluoPushDown(en_vocab.strings)
    move_types = ("M", "B", "I", "L", "U", "O")
    # Register exactly the actions that occur in the annotation.
    for tag in biluo_tags:
        if tag is not None:
            if tag == "O":
                action, label = "O", ""
            else:
                action, label = tag.split("-")
            moves.add_action(move_types.index(action), label)
    moves.preprocess_gold(gold)
    moves.get_oracle_sequence(doc, gold)
def test_accept_blocked_token(): def test_accept_blocked_token():
"""Test succesful blocking of tokens to be in an entity.""" """Test succesful blocking of tokens to be in an entity."""
# 1. test normal behaviour # 1. test normal behaviour

View File

@ -187,7 +187,7 @@ def test_issue1799():
def test_issue1807(): def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab.""" """Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab() vocab = Vocab(vectors_name="test_issue1807")
assert "hello" not in vocab assert "hello" not in vocab
vocab.set_vector("hello", numpy.ones((50,), dtype="f")) vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab assert "hello" in vocab

View File

@ -184,7 +184,7 @@ def test_issue2833(en_vocab):
def test_issue2871(): def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words.""" """Test that vectors recover the correct key for spaCy reserved words."""
words = ["dog", "cat", "SUFFIX"] words = ["dog", "cat", "SUFFIX"]
vocab = Vocab() vocab = Vocab(vectors_name="test_issue2871")
vocab.vectors.resize(shape=(3, 10)) vocab.vectors.resize(shape=(3, 10))
vector_data = numpy.zeros((3, 10), dtype="f") vector_data = numpy.zeros((3, 10), dtype="f")
for word in words: for word in words:

View File

@ -30,20 +30,20 @@ def test_issue3002():
def test_issue3009(en_vocab): def test_issue3009(en_vocab):
"""Test problem with matcher quantifiers""" """Test problem with matcher quantifiers"""
patterns = [ patterns = [
[{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}], [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"TAG": "IN"}],
[ [
{"LEMMA": "have"}, {"LEMMA": "have"},
{"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"},
{"LOWER": "to"}, {"LOWER": "to"},
{"LOWER": "do"}, {"LOWER": "do"},
{"POS": "ADP"}, {"TAG": "IN"},
], ],
[ [
{"LEMMA": "have"}, {"LEMMA": "have"},
{"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"},
{"LOWER": "to"}, {"LOWER": "to"},
{"LOWER": "do"}, {"LOWER": "do"},
{"POS": "ADP"}, {"TAG": "IN"},
], ],
] ]
words = ["also", "has", "to", "do", "with"] words = ["also", "has", "to", "do", "with"]

View File

@ -32,7 +32,7 @@ def test_displacy_parse_deps(en_vocab):
assert isinstance(deps, dict) assert isinstance(deps, dict)
assert deps["words"] == [ assert deps["words"] == [
{"text": "This", "tag": "DET"}, {"text": "This", "tag": "DET"},
{"text": "is", "tag": "VERB"}, {"text": "is", "tag": "AUX"},
{"text": "a", "tag": "DET"}, {"text": "a", "tag": "DET"},
{"text": "sentence", "tag": "NOUN"}, {"text": "sentence", "tag": "NOUN"},
] ]

View File

@ -3,8 +3,12 @@ from __future__ import unicode_literals
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
from spacy.gold import spans_from_biluo_tags, GoldParse from spacy.gold import spans_from_biluo_tags, GoldParse
from spacy.gold import GoldCorpus, docs_to_json
from spacy.lang.en import English
from spacy.tokens import Doc from spacy.tokens import Doc
from .util import make_tempdir
import pytest import pytest
import srsly
def test_gold_biluo_U(en_vocab): def test_gold_biluo_U(en_vocab):
@ -81,3 +85,28 @@ def test_gold_ner_missing_tags(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.") doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"] biluo_tags = [None, "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
gold = GoldParse(doc, entities=biluo_tags) # noqa: F841 gold = GoldParse(doc, entities=biluo_tags) # noqa: F841
def test_roundtrip_docs_to_json():
    """Write a Doc with text categories to JSON via docs_to_json, load it
    back through GoldCorpus, and check that the text and cats survive.
    """
    text = "I flew to Silicon Valley via London."
    cats = {"TRAVEL": 1.0, "BAKING": 0.0}
    nlp = English()
    doc = nlp(text)
    doc.cats = cats
    # Only the first token starts a sentence, so the doc is one sentence.
    for idx in range(len(doc)):
        doc[idx].is_sent_start = idx == 0

    with make_tempdir() as tmpdir:
        json_file = tmpdir / "roundtrip.json"
        srsly.write_json(json_file, [docs_to_json(doc)])
        # The same file serves as both the train and the dev location.
        json_loc = str(json_file)
        goldcorpus = GoldCorpus(json_loc, json_loc)

    reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))

    assert len(doc) == goldcorpus.count_train()
    assert text == reloaded_doc.text
    for label, value in cats.items():
        assert label in goldparse.cats
        assert value == goldparse.cats[label]

View File

@ -1,9 +1,12 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx from pytest import approx
from spacy.gold import GoldParse from spacy.gold import GoldParse
from spacy.scorer import Scorer from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc from .util import get_doc
test_ner_cardinal = [ test_ner_cardinal = [
@ -66,3 +69,73 @@ def test_ner_per_type(en_vocab):
assert results["ents_per_type"]["ORG"]["p"] == 50 assert results["ents_per_type"]["ORG"]["p"] == 50
assert results["ents_per_type"]["ORG"]["r"] == 100 assert results["ents_per_type"]["ORG"]["r"] == 100
assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666) assert results["ents_per_type"]["ORG"]["f"] == approx(66.66666)
def test_roc_auc_score():
    """Check _roc_curve, _roc_auc_score and the ROCAUCScore wrapper on toy
    binary classification inputs, including the single-class cases where
    the score is undefined.
    """
    # Binary classification, toy tests from scikit-learn test suite.
    # Each case: (y_true, y_score, expected first curve array,
    #             expected second curve array, expected AUC)
    toy_cases = [
        ([0, 1], [0, 1], [0, 0, 1], [0, 1, 1], 1.0),
        ([0, 1], [1, 0], [0, 1, 1], [0, 0, 1], 0.0),
        ([1, 0], [1, 1], [0, 1], [0, 1], 0.5),
        ([1, 0], [1, 0], [0, 0, 1], [0, 1, 1], 1.0),
        ([1, 0], [0.5, 0.5], [0, 1], [0, 1], 0.5),
    ]
    for y_true, y_score, expected_first, expected_second, expected_auc in toy_cases:
        curve_first, curve_second, _ = _roc_curve(y_true, y_score)
        roc_auc = _roc_auc_score(y_true, y_score)
        assert_array_almost_equal(curve_first, expected_first)
        assert_array_almost_equal(curve_second, expected_second)
        assert_almost_equal(roc_auc, expected_auc)

    # same result as the last case above with ROCAUCScore wrapper
    score = ROCAUCScore()
    score.score_set(0.5, 1)
    score.score_set(0.5, 0)
    assert_almost_equal(score.score, 0.5)

    # check that errors are raised in undefined cases and score is -inf:
    # first with only label 0 present, then with only label 1 present
    y_score = [0.25, 0.75]
    for only_label in (0, 1):
        y_true = [only_label, only_label]
        with pytest.raises(ValueError):
            _roc_auc_score(y_true, y_score)

        score = ROCAUCScore()
        score.score_set(0.25, only_label)
        score.score_set(0.75, only_label)
        assert score.score == -float("inf")

View File

@ -2,7 +2,8 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import pytest import pytest
from spacy.lookups import Lookups from spacy.lookups import Lookups, Table
from spacy.strings import get_string_id
from spacy.vocab import Vocab from spacy.vocab import Vocab
from ..util import make_tempdir from ..util import make_tempdir
@ -19,9 +20,9 @@ def test_lookups_api():
table = lookups.get_table(table_name) table = lookups.get_table(table_name)
assert table.name == table_name assert table.name == table_name
assert len(table) == 2 assert len(table) == 2
assert table.get("hello") == "world" assert table["hello"] == "world"
table.set("a", "b") table["a"] = "b"
assert table.get("a") == "b" assert table["a"] == "b"
table = lookups.get_table(table_name) table = lookups.get_table(table_name)
assert len(table) == 3 assert len(table) == 3
with pytest.raises(KeyError): with pytest.raises(KeyError):
@ -36,8 +37,44 @@ def test_lookups_api():
lookups.get_table(table_name) lookups.get_table(table_name)
# This fails on Python 3.5 def test_table_api():
@pytest.mark.xfail table = Table(name="table")
assert table.name == "table"
assert len(table) == 0
assert "abc" not in table
data = {"foo": "bar", "hello": "world"}
table = Table(name="table", data=data)
assert len(table) == len(data)
assert "foo" in table
assert get_string_id("foo") in table
assert table["foo"] == "bar"
assert table[get_string_id("foo")] == "bar"
assert table.get("foo") == "bar"
assert table.get("abc") is None
table["abc"] = 123
assert table["abc"] == 123
assert table[get_string_id("abc")] == 123
table.set("def", 456)
assert table["def"] == 456
assert table[get_string_id("def")] == 456
def test_table_api_to_from_bytes():
    """Round-trip a Table through to_bytes/from_bytes and check that
    loading into a non-empty table replaces its previous contents.
    """
    entries = {"foo": "bar", "hello": "world", "abc": 123}
    serialized = Table(name="table", data=entries).to_bytes()

    # Loading into a fresh table restores the name and the entries.
    restored = Table().from_bytes(serialized)
    assert restored.name == "table"
    assert len(restored) == 3
    assert restored["foo"] == "bar"
    assert restored[get_string_id("foo")] == "bar"

    # Loading into a table that already has data drops the old key.
    prefilled = Table(data={"def": 456})
    prefilled.from_bytes(serialized)
    assert len(prefilled) == 3
    assert "def" not in prefilled
@pytest.mark.skip(reason="This fails on Python 3.5")
def test_lookups_to_from_bytes(): def test_lookups_to_from_bytes():
lookups = Lookups() lookups = Lookups()
lookups.add_table("table1", {"foo": "bar", "hello": "world"}) lookups.add_table("table1", {"foo": "bar", "hello": "world"})
@ -50,15 +87,14 @@ def test_lookups_to_from_bytes():
assert "table2" in new_lookups assert "table2" in new_lookups
table1 = new_lookups.get_table("table1") table1 = new_lookups.get_table("table1")
assert len(table1) == 2 assert len(table1) == 2
assert table1.get("foo") == "bar" assert table1["foo"] == "bar"
table2 = new_lookups.get_table("table2") table2 = new_lookups.get_table("table2")
assert len(table2) == 3 assert len(table2) == 3
assert table2.get("b") == 2 assert table2["b"] == 2
assert new_lookups.to_bytes() == lookups_bytes assert new_lookups.to_bytes() == lookups_bytes
# This fails on Python 3.5 @pytest.mark.skip(reason="This fails on Python 3.5")
@pytest.mark.xfail
def test_lookups_to_from_disk(): def test_lookups_to_from_disk():
lookups = Lookups() lookups = Lookups()
lookups.add_table("table1", {"foo": "bar", "hello": "world"}) lookups.add_table("table1", {"foo": "bar", "hello": "world"})
@ -72,14 +108,13 @@ def test_lookups_to_from_disk():
assert "table2" in new_lookups assert "table2" in new_lookups
table1 = new_lookups.get_table("table1") table1 = new_lookups.get_table("table1")
assert len(table1) == 2 assert len(table1) == 2
assert table1.get("foo") == "bar" assert table1["foo"] == "bar"
table2 = new_lookups.get_table("table2") table2 = new_lookups.get_table("table2")
assert len(table2) == 3 assert len(table2) == 3
assert table2.get("b") == 2 assert table2["b"] == 2
# This fails on Python 3.5 @pytest.mark.skip(reason="This fails on Python 3.5")
@pytest.mark.xfail
def test_lookups_to_from_bytes_via_vocab(): def test_lookups_to_from_bytes_via_vocab():
table_name = "test" table_name = "test"
vocab = Vocab() vocab = Vocab()
@ -93,12 +128,11 @@ def test_lookups_to_from_bytes_via_vocab():
assert table_name in new_vocab.lookups assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name) table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2 assert len(table) == 2
assert table.get("hello") == "world" assert table["hello"] == "world"
assert new_vocab.to_bytes() == vocab_bytes assert new_vocab.to_bytes() == vocab_bytes
# This fails on Python 3.5 @pytest.mark.skip(reason="This fails on Python 3.5")
@pytest.mark.xfail
def test_lookups_to_from_disk_via_vocab(): def test_lookups_to_from_disk_via_vocab():
table_name = "test" table_name = "test"
vocab = Vocab() vocab = Vocab()
@ -113,4 +147,4 @@ def test_lookups_to_from_disk_via_vocab():
assert table_name in new_vocab.lookups assert table_name in new_vocab.lookups
table = new_vocab.lookups.get_table(table_name) table = new_vocab.lookups.get_table(table_name)
assert len(table) == 2 assert len(table) == 2
assert table.get("hello") == "world" assert table["hello"] == "world"

View File

@ -259,7 +259,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
def test_vocab_add_vector(): def test_vocab_add_vector():
vocab = Vocab() vocab = Vocab(vectors_name="test_vocab_add_vector")
data = numpy.ndarray((5, 3), dtype="f") data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0
@ -272,7 +272,7 @@ def test_vocab_add_vector():
def test_vocab_prune_vectors(): def test_vocab_prune_vectors():
vocab = Vocab() vocab = Vocab(vectors_name="test_vocab_prune_vectors")
_ = vocab["cat"] # noqa: F841 _ = vocab["cat"] # noqa: F841
_ = vocab["dog"] # noqa: F841 _ = vocab["dog"] # noqa: F841
_ = vocab["kitten"] # noqa: F841 _ = vocab["kitten"] # noqa: F841

View File

@ -4,5 +4,6 @@ from __future__ import unicode_literals
from .doc import Doc from .doc import Doc
from .token import Token from .token import Token
from .span import Span from .span import Span
from ._serialize import DocBin
__all__ = ["Doc", "Token", "Span"] __all__ = ["Doc", "Token", "Span", "DocBin"]

View File

@ -8,36 +8,77 @@ from thinc.neural.ops import NumpyOps
from ..compat import copy_reg from ..compat import copy_reg
from ..tokens import Doc from ..tokens import Doc
from ..attrs import SPACY, ORTH from ..attrs import SPACY, ORTH, intify_attrs
from ..errors import Errors
class DocBox(object): class DocBin(object):
"""Serialize analyses from a collection of doc objects.""" """Pack Doc objects for binary serialization.
The DocBin class lets you efficiently serialize the information from a
collection of Doc objects. You can control which information is serialized
by passing a list of attribute IDs, and optionally also specify whether the
user data is serialized. The DocBin is faster and produces smaller data
sizes than pickle, and allows you to deserialize without executing arbitrary
Python code.
The serialization format is gzipped msgpack, where the msgpack object has
the following structure:
{
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
"tokens": bytes, # Serialized numpy uint64 array with the token data
"spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data
}
Strings for the words, tags, labels etc are represented by 64-bit hashes in
the token data, and every string that occurs at least once is passed via the
strings object. This means the storage is more efficient if you pack more
documents together, because you have less duplication in the strings.
A notable downside to this format is that you can't easily extract just one
document from the DocBin.
"""
def __init__(self, attrs=None, store_user_data=False): def __init__(self, attrs=None, store_user_data=False):
"""Create a DocBox object, to hold serialized annotations. """Create a DocBin object to hold serialized annotations.
attrs (list): List of attributes to serialize. 'orth' and 'spacy' are attrs (list): List of attributes to serialize. 'orth' and 'spacy' are
always serialized, so they're not required. Defaults to None. always serialized, so they're not required. Defaults to None.
store_user_data (bool): Whether to include the `Doc.user_data`.
RETURNS (DocBin): The newly constructed object.
DOCS: https://spacy.io/api/docbin#init
""" """
attrs = attrs or [] attrs = attrs or []
# Ensure ORTH is always attrs[0] attrs = sorted(intify_attrs(attrs))
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
self.tokens = [] self.tokens = []
self.spaces = [] self.spaces = []
self.user_data = [] self.user_data = []
self.strings = set() self.strings = set()
self.store_user_data = store_user_data self.store_user_data = store_user_data
def __len__(self):
"""RETURNS: The number of Doc objects added to the DocBin."""
return len(self.tokens)
def add(self, doc): def add(self, doc):
"""Add a doc's annotations to the DocBox for serialization.""" """Add a Doc's annotations to the DocBin for serialization.
doc (Doc): The Doc object to add.
DOCS: https://spacy.io/api/docbin#add
"""
array = doc.to_array(self.attrs) array = doc.to_array(self.attrs)
if len(array.shape) == 1: if len(array.shape) == 1:
array = array.reshape((array.shape[0], 1)) array = array.reshape((array.shape[0], 1))
self.tokens.append(array) self.tokens.append(array)
spaces = doc.to_array(SPACY) spaces = doc.to_array(SPACY)
assert array.shape[0] == spaces.shape[0] assert array.shape[0] == spaces.shape[0] # this should never happen
spaces = spaces.reshape((spaces.shape[0], 1)) spaces = spaces.reshape((spaces.shape[0], 1))
self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.spaces.append(numpy.asarray(spaces, dtype=bool))
self.strings.update(w.text for w in doc) self.strings.update(w.text for w in doc)
@ -45,7 +86,13 @@ class DocBox(object):
self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.user_data.append(srsly.msgpack_dumps(doc.user_data))
def get_docs(self, vocab): def get_docs(self, vocab):
"""Recover Doc objects from the annotations, using the given vocab.""" """Recover Doc objects from the annotations, using the given vocab.
vocab (Vocab): The shared vocab.
YIELDS (Doc): The Doc objects.
DOCS: https://spacy.io/api/docbin#get_docs
"""
for string in self.strings: for string in self.strings:
vocab[string] vocab[string]
orth_col = self.attrs.index(ORTH) orth_col = self.attrs.index(ORTH)
@ -60,8 +107,16 @@ class DocBox(object):
yield doc yield doc
def merge(self, other): def merge(self, other):
"""Extend the annotations of this DocBox with the annotations from another.""" """Extend the annotations of this DocBin with the annotations from
assert self.attrs == other.attrs another. Will raise an error if the pre-defined attrs of the two
DocBins don't match.
other (DocBin): The DocBin to merge into the current bin.
DOCS: https://spacy.io/api/docbin#merge
"""
if self.attrs != other.attrs:
raise ValueError(Errors.E166.format(current=self.attrs, other=other.attrs))
self.tokens.extend(other.tokens) self.tokens.extend(other.tokens)
self.spaces.extend(other.spaces) self.spaces.extend(other.spaces)
self.strings.update(other.strings) self.strings.update(other.strings)
@ -69,9 +124,14 @@ class DocBox(object):
self.user_data.extend(other.user_data) self.user_data.extend(other.user_data)
def to_bytes(self): def to_bytes(self):
"""Serialize the DocBox's annotations into a byte string.""" """Serialize the DocBin's annotations to a bytestring.
RETURNS (bytes): The serialized DocBin.
DOCS: https://spacy.io/api/docbin#to_bytes
"""
for tokens in self.tokens: for tokens in self.tokens:
assert len(tokens.shape) == 2, tokens.shape assert len(tokens.shape) == 2, tokens.shape # this should never happen
lengths = [len(tokens) for tokens in self.tokens] lengths = [len(tokens) for tokens in self.tokens]
msg = { msg = {
"attrs": self.attrs, "attrs": self.attrs,
@ -84,9 +144,15 @@ class DocBox(object):
msg["user_data"] = self.user_data msg["user_data"] = self.user_data
return gzip.compress(srsly.msgpack_dumps(msg)) return gzip.compress(srsly.msgpack_dumps(msg))
def from_bytes(self, string): def from_bytes(self, bytes_data):
"""Deserialize the DocBox's annotations from a byte string.""" """Deserialize the DocBin's annotations from a bytestring.
msg = srsly.msgpack_loads(gzip.decompress(string))
bytes_data (bytes): The data to load from.
RETURNS (DocBin): The loaded DocBin.
DOCS: https://spacy.io/api/docbin#from_bytes
"""
msg = srsly.msgpack_loads(gzip.decompress(bytes_data))
self.attrs = msg["attrs"] self.attrs = msg["attrs"]
self.strings = set(msg["strings"]) self.strings = set(msg["strings"])
lengths = numpy.fromstring(msg["lengths"], dtype="int32") lengths = numpy.fromstring(msg["lengths"], dtype="int32")
@ -100,35 +166,35 @@ class DocBox(object):
if self.store_user_data and "user_data" in msg: if self.store_user_data and "user_data" in msg:
self.user_data = list(msg["user_data"]) self.user_data = list(msg["user_data"])
for tokens in self.tokens: for tokens in self.tokens:
assert len(tokens.shape) == 2, tokens.shape assert len(tokens.shape) == 2, tokens.shape # this should never happen
return self return self
def merge_boxes(boxes): def merge_bins(bins):
merged = None merged = None
for byte_string in boxes: for byte_string in bins:
if byte_string is not None: if byte_string is not None:
box = DocBox(store_user_data=True).from_bytes(byte_string) doc_bin = DocBin(store_user_data=True).from_bytes(byte_string)
if merged is None: if merged is None:
merged = box merged = doc_bin
else: else:
merged.merge(box) merged.merge(doc_bin)
if merged is not None: if merged is not None:
return merged.to_bytes() return merged.to_bytes()
else: else:
return b"" return b""
def pickle_box(box): def pickle_bin(doc_bin):
return (unpickle_box, (box.to_bytes(),)) return (unpickle_bin, (doc_bin.to_bytes(),))
def unpickle_box(byte_string): def unpickle_bin(byte_string):
return DocBox().from_bytes(byte_string) return DocBin().from_bytes(byte_string)
copy_reg.pickle(DocBox, pickle_box, unpickle_box) copy_reg.pickle(DocBin, pickle_bin, unpickle_bin)
# Compatibility, as we had named it this previously. # Compatibility, as we had named it this previously.
Binder = DocBox Binder = DocBin
__all__ = ["DocBox"] __all__ = ["DocBin"]

View File

@ -1091,6 +1091,37 @@ cdef class Doc:
data["_"][attr] = value data["_"][attr] = value
return data return data
def to_utf8_array(self, int nr_char=-1):
    """Encode word strings to utf8, and export to a fixed-width array
    of characters. Characters are placed into the array in the order:
        0, -1, 1, -2, etc
    For example, if the array is sliced array[:, :8], the array will
    contain the first 4 characters and last 4 characters of each word ---
    with the middle characters clipped out. The value 255 is used as a pad
    value.

    nr_char (int): Number of byte columns per token. The default of -1
        uses the length of the longest encoded token, or 0 if the doc
        has no tokens.
    RETURNS (numpy.ndarray): uint8 array with one row per token and
        nr_char columns.
    """
    byte_strings = [token.orth_.encode('utf8') for token in self]
    if nr_char == -1:
        # max() raises ValueError on an empty sequence, so a doc without
        # tokens falls back to a zero-width array instead of crashing.
        nr_char = max(len(bs) for bs in byte_strings) if byte_strings else 0
    cdef np.ndarray output = numpy.zeros((len(byte_strings), nr_char), dtype='uint8')
    output.fill(255)
    cdef int i, j, start_idx, end_idx
    cdef bytes byte_string
    cdef unsigned char utf8_char
    for i, byte_string in enumerate(byte_strings):
        j = 0
        start_idx = 0
        end_idx = len(byte_string) - 1
        # Alternate between the next unread byte from the front and the
        # next unread byte from the back until the row is full or the two
        # cursors cross (every byte has been written).
        while j < nr_char and start_idx <= end_idx:
            output[i, j] = <unsigned char>byte_string[start_idx]
            start_idx += 1
            j += 1
            if j < nr_char and start_idx <= end_idx:
                output[i, j] = <unsigned char>byte_string[end_idx]
                end_idx -= 1
                j += 1
    return output
cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2: cdef int token_by_start(const TokenC* tokens, int length, int start_char) except -2:
cdef int i cdef int i

View File

@ -0,0 +1,9 @@
from ..vocab cimport Vocab
from ..typedefs cimport hash_t
from ..structs cimport MorphAnalysisC
# Declaration of the MorphAnalysis extension type's C-level attributes.
cdef class MorphAnalysis:
# Vocab the analysis is looked up in; readable but not assignable from Python.
cdef readonly Vocab vocab
# Hash key used to fetch the analysis from vocab.morphology.tags.
cdef hash_t key
# Analysis struct held by value (the implementation zero-fills it when the key is not found).
cdef MorphAnalysisC c

View File

@ -0,0 +1,423 @@
from libc.string cimport memset
from ..vocab cimport Vocab
from ..typedefs cimport hash_t, attr_t
from ..morphology cimport list_features, check_feature, get_field, tag_to_json
from ..strings import get_string_id
cdef class MorphAnalysis:
    """Control access to morphological features for a token."""

    def __init__(self, Vocab vocab, features=tuple()):
        """Create an analysis by adding `features` to the vocab's morphology.

        vocab (Vocab): The vocab whose morphology stores the analysis.
        features: Features passed to `vocab.morphology.add`.
        """
        self.vocab = vocab
        self.key = self.vocab.morphology.add(features)
        analysis = <const MorphAnalysisC*>self.vocab.morphology.tags.get(self.key)
        if analysis is not NULL:
            # Keep a by-value copy of the stored struct.
            self.c = analysis[0]
        else:
            # Nothing stored under this key: use an all-zero analysis.
            memset(&self.c, 0, sizeof(self.c))

    @classmethod
    def from_id(cls, Vocab vocab, hash_t key):
        """Create a morphological analysis from a given ID."""
        # __new__ is used so that __init__ (which would add features to the
        # morphology) is not run.
        cdef MorphAnalysis morph = MorphAnalysis.__new__(MorphAnalysis, vocab)
        morph.vocab = vocab
        morph.key = key
        analysis = <const MorphAnalysisC*>vocab.morphology.tags.get(key)
        if analysis is not NULL:
            morph.c = analysis[0]
        else:
            memset(&morph.c, 0, sizeof(morph.c))
        return morph

    def __contains__(self, feature):
        """Test whether the morphological analysis contains some feature."""
        cdef attr_t feat_id = get_string_id(feature)
        return check_feature(&self.c, feat_id)

    def __iter__(self):
        """Iterate over the features in the analysis."""
        cdef attr_t feature
        for feature in list_features(&self.c):
            yield self.vocab.strings[feature]

    def __len__(self):
        """The number of features in the analysis."""
        return self.c.length

    def __str__(self):
        # to_json() is documented to return a list, and __str__ must return
        # a string, so the JSON-serializable value is converted explicitly.
        return str(self.to_json())

    def __repr__(self):
        # Same conversion as __str__: returning the raw list would raise
        # a TypeError when repr() is called.
        return str(self.to_json())

    def __hash__(self):
        return self.key

    def get(self, unicode field):
        """Retrieve a feature by field."""
        cdef int field_id = self.vocab.morphology._feat_map.attr2field[field]
        return self.vocab.strings[get_field(&self.c, field_id)]

    def to_json(self):
        """Produce a json serializable representation, which will be a list of
        strings.
        """
        return tag_to_json(&self.c)

    @property
    def is_base_form(self):
        raise NotImplementedError

    @property
    def pos(self):
        return self.c.pos

    @property
    def pos_(self):
        return self.vocab.strings[self.c.pos]

    property id:
        def __get__(self):
            return self.key

    # Raw values of the individual fields, as stored in the analysis struct.

    property abbr:
        def __get__(self):
            return self.c.abbr

    property adp_type:
        def __get__(self):
            return self.c.adp_type

    property adv_type:
        def __get__(self):
            return self.c.adv_type

    property animacy:
        def __get__(self):
            return self.c.animacy

    property aspect:
        def __get__(self):
            return self.c.aspect

    property case:
        def __get__(self):
            return self.c.case

    property conj_type:
        def __get__(self):
            return self.c.conj_type

    property connegative:
        def __get__(self):
            return self.c.connegative

    property definite:
        def __get__(self):
            return self.c.definite

    property degree:
        def __get__(self):
            return self.c.degree

    property derivation:
        def __get__(self):
            return self.c.derivation

    property echo:
        def __get__(self):
            return self.c.echo

    property foreign:
        def __get__(self):
            return self.c.foreign

    property gender:
        def __get__(self):
            return self.c.gender

    property hyph:
        def __get__(self):
            return self.c.hyph

    property inf_form:
        def __get__(self):
            return self.c.inf_form

    property mood:
        def __get__(self):
            return self.c.mood

    property name_type:
        def __get__(self):
            return self.c.name_type

    property negative:
        def __get__(self):
            return self.c.negative

    property noun_type:
        def __get__(self):
            return self.c.noun_type

    property number:
        def __get__(self):
            return self.c.number

    property num_form:
        def __get__(self):
            return self.c.num_form

    property num_type:
        def __get__(self):
            return self.c.num_type

    property num_value:
        def __get__(self):
            return self.c.num_value

    property part_form:
        def __get__(self):
            return self.c.part_form

    property part_type:
        def __get__(self):
            return self.c.part_type

    property person:
        def __get__(self):
            return self.c.person

    property polite:
        def __get__(self):
            return self.c.polite

    property polarity:
        def __get__(self):
            return self.c.polarity

    property poss:
        def __get__(self):
            return self.c.poss

    property prefix:
        def __get__(self):
            return self.c.prefix

    property prep_case:
        def __get__(self):
            return self.c.prep_case

    property pron_type:
        def __get__(self):
            return self.c.pron_type

    property punct_side:
        def __get__(self):
            return self.c.punct_side

    property punct_type:
        def __get__(self):
            return self.c.punct_type

    property reflex:
        def __get__(self):
            return self.c.reflex

    property style:
        def __get__(self):
            return self.c.style

    property style_variant:
        def __get__(self):
            return self.c.style_variant

    property tense:
        def __get__(self):
            return self.c.tense

    property typo:
        def __get__(self):
            return self.c.typo

    property verb_form:
        def __get__(self):
            return self.c.verb_form

    property voice:
        def __get__(self):
            return self.c.voice

    property verb_type:
        def __get__(self):
            return self.c.verb_type

    # The same fields, with the stored value looked up in vocab.strings.

    property abbr_:
        def __get__(self):
            return self.vocab.strings[self.c.abbr]

    property adp_type_:
        def __get__(self):
            return self.vocab.strings[self.c.adp_type]

    property adv_type_:
        def __get__(self):
            return self.vocab.strings[self.c.adv_type]

    property animacy_:
        def __get__(self):
            return self.vocab.strings[self.c.animacy]

    property aspect_:
        def __get__(self):
            return self.vocab.strings[self.c.aspect]

    property case_:
        def __get__(self):
            return self.vocab.strings[self.c.case]

    property conj_type_:
        def __get__(self):
            return self.vocab.strings[self.c.conj_type]

    property connegative_:
        def __get__(self):
            return self.vocab.strings[self.c.connegative]

    property definite_:
        def __get__(self):
            return self.vocab.strings[self.c.definite]

    property degree_:
        def __get__(self):
            return self.vocab.strings[self.c.degree]

    property derivation_:
        def __get__(self):
            return self.vocab.strings[self.c.derivation]

    property echo_:
        def __get__(self):
            return self.vocab.strings[self.c.echo]

    property foreign_:
        def __get__(self):
            return self.vocab.strings[self.c.foreign]

    property gender_:
        def __get__(self):
            return self.vocab.strings[self.c.gender]

    property hyph_:
        def __get__(self):
            return self.vocab.strings[self.c.hyph]

    property inf_form_:
        def __get__(self):
            return self.vocab.strings[self.c.inf_form]

    property name_type_:
        def __get__(self):
            return self.vocab.strings[self.c.name_type]

    property negative_:
        def __get__(self):
            return self.vocab.strings[self.c.negative]

    # Added so that noun_type has a string accessor like every other field.
    property noun_type_:
        def __get__(self):
            return self.vocab.strings[self.c.noun_type]

    property mood_:
        def __get__(self):
            return self.vocab.strings[self.c.mood]

    property number_:
        def __get__(self):
            return self.vocab.strings[self.c.number]

    property num_form_:
        def __get__(self):
            return self.vocab.strings[self.c.num_form]

    property num_type_:
        def __get__(self):
            return self.vocab.strings[self.c.num_type]

    property num_value_:
        def __get__(self):
            return self.vocab.strings[self.c.num_value]

    property part_form_:
        def __get__(self):
            return self.vocab.strings[self.c.part_form]

    property part_type_:
        def __get__(self):
            return self.vocab.strings[self.c.part_type]

    property person_:
        def __get__(self):
            return self.vocab.strings[self.c.person]

    property polite_:
        def __get__(self):
            return self.vocab.strings[self.c.polite]

    property polarity_:
        def __get__(self):
            return self.vocab.strings[self.c.polarity]

    property poss_:
        def __get__(self):
            return self.vocab.strings[self.c.poss]

    property prefix_:
        def __get__(self):
            return self.vocab.strings[self.c.prefix]

    property prep_case_:
        def __get__(self):
            return self.vocab.strings[self.c.prep_case]

    property pron_type_:
        def __get__(self):
            return self.vocab.strings[self.c.pron_type]

    property punct_side_:
        def __get__(self):
            return self.vocab.strings[self.c.punct_side]

    property punct_type_:
        def __get__(self):
            return self.vocab.strings[self.c.punct_type]

    property reflex_:
        def __get__(self):
            return self.vocab.strings[self.c.reflex]

    property style_:
        def __get__(self):
            return self.vocab.strings[self.c.style]

    property style_variant_:
        def __get__(self):
            return self.vocab.strings[self.c.style_variant]

    property tense_:
        def __get__(self):
            return self.vocab.strings[self.c.tense]

    property typo_:
        def __get__(self):
            return self.vocab.strings[self.c.typo]

    property verb_form_:
        def __get__(self):
            return self.vocab.strings[self.c.verb_form]

    property voice_:
        def __get__(self):
            return self.vocab.strings[self.c.voice]

    property verb_type_:
        def __get__(self):
            return self.vocab.strings[self.c.verb_type]

View File

@ -26,6 +26,7 @@ from .. import util
from ..compat import is_config from ..compat import is_config
from ..errors import Errors, Warnings, user_warning, models_warning from ..errors import Errors, Warnings, user_warning, models_warning
from .underscore import Underscore, get_ext_args from .underscore import Underscore, get_ext_args
from .morphanalysis cimport MorphAnalysis
cdef class Token: cdef class Token:
@ -218,6 +219,10 @@ cdef class Token:
xp = get_array_module(vector) xp = get_array_module(vector)
return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)) return (xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm))
@property
def morph(self):
    """RETURNS (MorphAnalysis): The analysis built from the morph ID stored
    on this token, looked up in the token's vocab.
    """
    morph_key = self.c.morph
    return MorphAnalysis.from_id(self.vocab, morph_key)
@property @property
def lex_id(self): def lex_id(self):
"""RETURNS (int): Sequential ID of the token's lexical type.""" """RETURNS (int): Sequential ID of the token's lexical type."""
@ -330,7 +335,7 @@ cdef class Token:
""" """
def __get__(self): def __get__(self):
if self.c.lemma == 0: if self.c.lemma == 0:
lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_) lemma_ = self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
return self.vocab.strings[lemma_] return self.vocab.strings[lemma_]
else: else:
return self.c.lemma return self.c.lemma
@ -858,7 +863,7 @@ cdef class Token:
""" """
def __get__(self): def __get__(self):
if self.c.lemma == 0: if self.c.lemma == 0:
return self.vocab.morphology.lemmatizer.lookup(self.orth_) return self.vocab.morphology.lemmatizer.lookup(self.orth_, orth=self.orth)
else: else:
return self.vocab.strings[self.c.lemma] return self.vocab.strings[self.c.lemma]

View File

@ -18,10 +18,10 @@ from .structs cimport SerializedLexemeC
from .compat import copy_reg, basestring_ from .compat import copy_reg, basestring_
from .errors import Errors from .errors import Errors
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .lookups import Lookups
from .attrs import intify_attrs, NORM from .attrs import intify_attrs, NORM
from .vectors import Vectors from .vectors import Vectors
from ._ml import link_vectors_to_models from ._ml import link_vectors_to_models
from .lookups import Lookups
from . import util from . import util
@ -33,7 +33,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None,
strings=tuple(), lookups=None, oov_prob=-20., **deprecated_kwargs): strings=tuple(), lookups=None, oov_prob=-20., vectors_name=None,
**deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -62,7 +63,7 @@ cdef class Vocab:
_ = self[string] _ = self[string]
self.lex_attr_getters = lex_attr_getters self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings, tag_map, lemmatizer) self.morphology = Morphology(self.strings, tag_map, lemmatizer)
self.vectors = Vectors() self.vectors = Vectors(name=vectors_name)
self.lookups = lookups self.lookups = lookups
@property @property
@ -318,7 +319,7 @@ cdef class Vocab:
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
self.vectors = Vectors(data=keep, keys=keys) self.vectors = Vectors(data=keep, keys=keys, name=self.vectors.name)
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
remap = {} remap = {}
for i, key in enumerate(keys[nr_row:]): for i, key in enumerate(keys[nr_row:]):

View File

@ -309,7 +309,7 @@ indented block as plain text and preserve whitespace.
### Using spaCy ### Using spaCy
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.") doc = nlp("This is a sentence.")
for token in doc: for token in doc:
print(token.text, token.pos_) print(token.text, token.pos_)
``` ```
@ -335,9 +335,9 @@ from spacy.matcher import Matcher
nlp = spacy.load('en_core_web_sm') nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
pattern = [{'LOWER': 'hello'}, {'IS_PUNCT': True}, {'LOWER': 'world'}] pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add('HelloWorld', None, pattern) matcher.add("HelloWorld", None, pattern)
doc = nlp(u'Hello, world! Hello world!') doc = nlp("Hello, world! Hello world!")
matches = matcher(doc) matches = matcher(doc)
``` ```
@ -360,7 +360,7 @@ interactive widget defaults to a regular code block.
### {executable="true"} ### {executable="true"}
import spacy import spacy
nlp = spacy.load("en_core_web_sm") nlp = spacy.load("en_core_web_sm")
doc = nlp(u"This is a sentence.") doc = nlp("This is a sentence.")
for token in doc: for token in doc:
print(token.text, token.pos_) print(token.text, token.pos_)
``` ```
@ -457,7 +457,8 @@ sit amet dignissim justo congue.
## Setup and installation {#setup} ## Setup and installation {#setup}
Before running the setup, make sure your versions of Before running the setup, make sure your versions of
[Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date. Node v10.15 or later is required. [Node](https://nodejs.org/en/) and [npm](https://www.npmjs.com/) are up to date.
Node v10.15 or later is required.
```bash ```bash
# Clone the repository # Clone the repository

View File

@ -16,7 +16,7 @@ menu:
> ```python > ```python
> from spacy.lang.en import English > from spacy.lang.en import English
> nlp = English() > nlp = English()
> tokens = nlp(u"Some\\nspaces and\\ttab characters") > tokens = nlp("Some\\nspaces and\\ttab characters")
> tokens_text = [t.text for t in tokens] > tokens_text = [t.text for t in tokens]
> assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"] > assert tokens_text == ["Some", "\\n", "spaces", " ", "and", "\\t", "tab", "characters"]
> ``` > ```
@ -80,8 +80,8 @@ training corpus and can be defined in the respective language data's
<Accordion title="Universal Part-of-speech Tags" id="pos-universal"> <Accordion title="Universal Part-of-speech Tags" id="pos-universal">
spaCy also maps all language-specific part-of-speech tags to a small, fixed set spaCy maps all language-specific part-of-speech tags to a small, fixed set of
of word type tags following the word type tags following the
[Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The [Universal Dependencies scheme](http://universaldependencies.org/u/pos/). The
universal tags don't code for any morphological features and only cover the word universal tags don't code for any morphological features and only cover the word
type. They're available as the [`Token.pos`](/api/token#attributes) and type. They're available as the [`Token.pos`](/api/token#attributes) and
@ -552,6 +552,10 @@ spaCy's JSON format, you can use the
"last": int, # index of last token "last": int, # index of last token
"label": string # phrase label "label": string # phrase label
}] }]
}],
"cats": [{ # new in v2.2: categories for text classifier
"label": string, # text category label
"value": float / bool # label applies (1.0/true) or not (0.0/false)
}] }]
}] }]
}] }]

View File

@ -8,6 +8,7 @@ menu:
- ['Info', 'info'] - ['Info', 'info']
- ['Validate', 'validate'] - ['Validate', 'validate']
- ['Convert', 'convert'] - ['Convert', 'convert']
- ['Debug data', 'debug-data']
- ['Train', 'train'] - ['Train', 'train']
- ['Pretrain', 'pretrain'] - ['Pretrain', 'pretrain']
- ['Init Model', 'init-model'] - ['Init Model', 'init-model']
@ -22,11 +23,11 @@ type `spacy --help`.
## Download {#download} ## Download {#download}
Download [models](/usage/models) for spaCy. The downloader finds the Download [models](/usage/models) for spaCy. The downloader finds the
best-matching compatible version, uses pip to download the model as a package best-matching compatible version, uses `pip install` to download the model as a
and automatically creates a [shortcut link](/usage/models#usage) to load the package and creates a [shortcut link](/usage/models#usage) if the model was
model by name. Direct downloads don't perform any compatibility checks and downloaded via a shortcut. Direct downloads don't perform any compatibility
require the model name to be specified with its version (e.g. checks and require the model name to be specified with its version (e.g.
`en_core_web_sm-2.0.0`). `en_core_web_sm-2.2.0`).
> #### Downloading best practices > #### Downloading best practices
> >
@ -39,16 +40,16 @@ require the model name to be specified with its version (e.g.
> also allow you to add it as a versioned package dependency to your project. > also allow you to add it as a versioned package dependency to your project.
```bash ```bash
$ python -m spacy download [model] [--direct] $ python -m spacy download [model] [--direct] [pip args]
``` ```
| Argument | Type | Description | | Argument | Type | Description |
| ---------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------- | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | positional | Model name or shortcut (`en`, `de`, `en_core_web_sm`). | | `model` | positional | Model name or shortcut (`en`, `de`, `en_core_web_sm`). |
| `--direct`, `-d` | flag | Force direct download of exact model version. | | `--direct`, `-d` | flag | Force direct download of exact model version. |
| other <Tag variant="new">2.1</Tag> | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory. | | pip args <Tag variant="new">2.1</Tag> | - | Additional installation options to be passed to `pip install` when installing the model package. For example, `--user` to install to the user home directory or `--no-deps` to not install model dependencies. |
| `--help`, `-h` | flag | Show help message and available arguments. | | `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | directory, symlink | The installed model package in your `site-packages` directory and a shortcut link as a symlink in `spacy/data`. | | **CREATES** | directory, symlink | The installed model package in your `site-packages` directory and a shortcut link as a symlink in `spacy/data` if installed via shortcut. |
## Link {#link} ## Link {#link}
@ -180,6 +181,165 @@ All output files generated by this command are compatible with
| `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. | | `conll`, `conllu`, `conllubio` | Universal Dependencies `.conllu` or `.conll` format. |
| `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | | `ner` | NER with IOB/IOB2 tags, one token per line with columns separated by whitespace. The first column is the token and the final column is the IOB tag. Sentences are separated by blank lines and documents are separated by the line `-DOCSTART- -X- O O`. Supports CoNLL 2003 NER format. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
| `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). | | `iob` | NER with IOB/IOB2 tags, one sentence per line with tokens separated by whitespace and annotation separated by `|`, either `word|B-ENT` or `word|POS|B-ENT`. See [sample data](https://github.com/explosion/spaCy/tree/master/examples/training/ner_example_data). |
## Debug data {#debug-data new="2.2"}
Analyze, debug and validate your training and development data. Get useful
stats, and find problems like invalid entity annotations, cyclic dependencies,
labels with too few examples and more.
```bash
$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format]
```
| Argument | Type | Description |
| -------------------------- | ---------- | -------------------------------------------------------------------------------------------------- |
| `lang` | positional | Model language. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. |
| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. |
| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. |
| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. |
| `--verbose`, `-V` | flag | Print additional information and explanations. |
| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. |
<Accordion title="Example output">
```
=========================== Data format validation ===========================
✔ Corpus is loadable
=============================== Training stats ===============================
Training pipeline: tagger, parser, ner
Starting with blank model 'en'
18127 training docs
2939 evaluation docs
⚠ 34 training examples also in evaluation data
============================== Vocab & Vectors ==============================
2083156 total words in the data (56962 unique)
⚠ 13020 misaligned tokens in the training data
⚠ 2423 misaligned tokens in the dev data
10 most common words: 'the' (98429), ',' (91756), '.' (87073), 'to' (50058),
'of' (49559), 'and' (44416), 'a' (34010), 'in' (31424), 'that' (22792), 'is'
(18952)
No word vectors present in the model
========================== Named Entity Recognition ==========================
18 new labels, 0 existing labels
528978 missing values (tokens with '-' label)
New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
(10490), 'NORP' (9033), 'MONEY' (5164), 'PERCENT' (3761), 'ORDINAL' (2122),
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
✔ Good amount of examples for all labels
✔ Examples without occurences available for all labels
✔ No entities consisting of or starting/ending with whitespace
=========================== Part-of-speech Tagging ===========================
49 labels in data (57 labels in tag map)
'NN' (266331), 'IN' (227365), 'DT' (185600), 'NNP' (164404), 'JJ' (119830),
'NNS' (110957), '.' (101482), ',' (92476), 'RB' (90090), 'PRP' (90081), 'VB'
(74538), 'VBD' (68199), 'CC' (62862), 'VBZ' (50712), 'VBP' (43420), 'VBN'
(42193), 'CD' (40326), 'VBG' (34764), 'TO' (31085), 'MD' (25863), 'PRP$'
(23335), 'HYPH' (13833), 'POS' (13427), 'UH' (13322), 'WP' (10423), 'WDT'
(9850), 'RP' (8230), 'WRB' (8201), ':' (8168), '''' (7392), '``' (6984), 'NNPS'
(5817), 'JJR' (5689), '$' (3710), 'EX' (3465), 'JJS' (3118), 'RBR' (2872),
'-RRB-' (2825), '-LRB-' (2788), 'PDT' (2078), 'XX' (1316), 'RBS' (1142), 'FW'
(794), 'NFP' (557), 'SYM' (440), 'WP$' (294), 'LS' (293), 'ADD' (191), 'AFX'
(24)
✔ All labels present in tag map for language 'en'
============================= Dependency Parsing =============================
Found 111703 sentences with an average length of 18.6 words.
Found 2251 nonprojective train sentences
Found 303 nonprojective dev sentences
47 labels in train data
211 labels in projectivized train data
'punct' (236796), 'prep' (188853), 'pobj' (182533), 'det' (172674), 'nsubj'
(169481), 'compound' (116142), 'ROOT' (111697), 'amod' (107945), 'dobj' (93540),
'aux' (86802), 'advmod' (86197), 'cc' (62679), 'conj' (59575), 'poss' (36449),
'ccomp' (36343), 'advcl' (29017), 'mark' (27990), 'nummod' (24582), 'relcl'
(21359), 'xcomp' (21081), 'attr' (18347), 'npadvmod' (17740), 'acomp' (17204),
'auxpass' (15639), 'appos' (15368), 'neg' (15266), 'nsubjpass' (13922), 'case'
(13408), 'acl' (12574), 'pcomp' (10340), 'nmod' (9736), 'intj' (9285), 'prt'
(8196), 'quantmod' (7403), 'dep' (4300), 'dative' (4091), 'agent' (3908), 'expl'
(3456), 'parataxis' (3099), 'oprd' (2326), 'predet' (1946), 'csubj' (1494),
'subtok' (1147), 'preconj' (692), 'meta' (469), 'csubjpass' (64), 'iobj' (1)
⚠ Low number of examples for label 'iobj' (1)
⚠ Low number of examples for 130 labels in the projectivized dependency
trees used for training. You may want to projectivize labels such as punct
before training in order to improve parser performance.
⚠ Projectivized labels with low numbers of examples: appos||attr: 12
advmod||dobj: 13 prep||ccomp: 12 nsubjpass||ccomp: 15 pcomp||prep: 14
amod||dobj: 9 attr||xcomp: 14 nmod||nsubj: 17 prep||advcl: 2 prep||prep: 5
nsubj||conj: 12 advcl||advmod: 18 ccomp||advmod: 11 ccomp||pcomp: 5 acl||pobj:
10 npadvmod||acomp: 7 dobj||pcomp: 14 nsubjpass||pcomp: 1 nmod||pobj: 8
amod||attr: 6 nmod||dobj: 12 aux||conj: 1 neg||conj: 1 dative||xcomp: 11
pobj||dative: 3 xcomp||acomp: 19 advcl||pobj: 2 nsubj||advcl: 2 csubj||ccomp: 1
advcl||acl: 1 relcl||nmod: 2 dobj||advcl: 10 advmod||advcl: 3 nmod||nsubjpass: 6
amod||pobj: 5 cc||neg: 1 attr||ccomp: 16 advcl||xcomp: 3 nmod||attr: 4
advcl||nsubjpass: 5 advcl||ccomp: 4 ccomp||conj: 1 punct||acl: 1 meta||acl: 1
parataxis||acl: 1 prep||acl: 1 amod||nsubj: 7 ccomp||ccomp: 3 acomp||xcomp: 5
dobj||acl: 5 prep||oprd: 6 advmod||acl: 2 dative||advcl: 1 pobj||agent: 5
xcomp||amod: 1 dep||advcl: 1 prep||amod: 8 relcl||compound: 1 advcl||csubj: 3
npadvmod||conj: 2 npadvmod||xcomp: 4 advmod||nsubj: 3 ccomp||amod: 7
advcl||conj: 1 nmod||conj: 2 advmod||nsubjpass: 2 dep||xcomp: 2 appos||ccomp: 1
advmod||dep: 1 advmod||advmod: 5 aux||xcomp: 8 dep||advmod: 1 dative||ccomp: 2
prep||dep: 1 conj||conj: 1 dep||ccomp: 4 cc||ROOT: 1 prep||ROOT: 1 nsubj||pcomp:
3 advmod||prep: 2 relcl||dative: 1 acl||conj: 1 advcl||attr: 4 prep||npadvmod: 1
nsubjpass||xcomp: 1 neg||advmod: 1 xcomp||oprd: 1 advcl||advcl: 1 dobj||dep: 3
nsubjpass||parataxis: 1 attr||pcomp: 1 ccomp||parataxis: 1 advmod||attr: 1
nmod||oprd: 1 appos||nmod: 2 advmod||relcl: 1 appos||npadvmod: 1 appos||conj: 1
prep||expl: 1 nsubjpass||conj: 1 punct||pobj: 1 cc||pobj: 1 conj||pobj: 1
punct||conj: 1 ccomp||dep: 1 oprd||xcomp: 3 ccomp||xcomp: 1 ccomp||nsubj: 1
nmod||dep: 1 xcomp||ccomp: 1 acomp||advcl: 1 intj||advmod: 1 advmod||acomp: 2
relcl||oprd: 1 advmod||prt: 1 advmod||pobj: 1 appos||nummod: 1 relcl||npadvmod:
3 mark||advcl: 1 aux||ccomp: 1 amod||nsubjpass: 1 npadvmod||advmod: 1 conj||dep:
1 nummod||pobj: 1 amod||npadvmod: 1 intj||pobj: 1 nummod||npadvmod: 1
xcomp||xcomp: 1 aux||dep: 1 advcl||relcl: 1
⚠ The following labels were found only in the train data: xcomp||amod,
advcl||relcl, prep||nsubjpass, acl||nsubj, nsubjpass||conj, xcomp||oprd,
advmod||conj, advmod||advmod, iobj, advmod||nsubjpass, dobj||conj, ccomp||amod,
meta||acl, xcomp||xcomp, prep||attr, prep||ccomp, advcl||acomp, acl||dobj,
advcl||advcl, pobj||agent, prep||advcl, nsubjpass||xcomp, prep||dep,
acomp||xcomp, aux||ccomp, ccomp||dep, conj||dep, relcl||compound,
nsubjpass||ccomp, nmod||dobj, advmod||advcl, advmod||acl, dobj||advcl,
dative||xcomp, prep||nsubj, ccomp||ccomp, nsubj||ccomp, xcomp||acomp,
prep||acomp, dep||advmod, acl||pobj, appos||dobj, npadvmod||acomp, cc||ROOT,
relcl||nsubj, nmod||pobj, acl||nsubjpass, ccomp||advmod, pcomp||prep,
amod||dobj, advmod||attr, advcl||csubj, appos||attr, dobj||pcomp, prep||ROOT,
relcl||pobj, advmod||pobj, amod||nsubj, ccomp||xcomp, prep||oprd,
npadvmod||advmod, appos||nummod, advcl||pobj, neg||advmod, acl||attr,
appos||nsubjpass, csubj||ccomp, amod||nsubjpass, intj||pobj, dep||advcl,
cc||neg, xcomp||ccomp, dative||ccomp, nmod||oprd, pobj||dative, prep||dobj,
dep||ccomp, relcl||attr, ccomp||nsubj, advcl||xcomp, nmod||dep, advcl||advmod,
ccomp||conj, pobj||prep, advmod||acomp, advmod||relcl, attr||pcomp,
ccomp||parataxis, oprd||xcomp, intj||advmod, nmod||nsubjpass, prep||npadvmod,
parataxis||acl, prep||pobj, advcl||dobj, amod||pobj, prep||acl, conj||pobj,
advmod||dep, punct||pobj, ccomp||acomp, acomp||advcl, nummod||npadvmod,
dobj||dep, npadvmod||xcomp, advcl||conj, relcl||npadvmod, punct||acl,
relcl||dobj, dobj||xcomp, nsubjpass||parataxis, dative||advcl, relcl||nmod,
advcl||ccomp, appos||npadvmod, ccomp||pcomp, prep||amod, mark||advcl,
prep||advmod, prep||xcomp, appos||nsubj, attr||ccomp, advmod||prt, dobj||ccomp,
aux||conj, advcl||nsubj, conj||conj, advmod||ccomp, advcl||nsubjpass,
attr||xcomp, nmod||conj, npadvmod||conj, relcl||dative, prep||expl,
nsubjpass||pcomp, advmod||xcomp, advmod||dobj, appos||pobj, nsubj||conj,
relcl||nsubjpass, advcl||attr, appos||ccomp, advmod||prep, prep||conj,
nmod||attr, punct||conj, neg||conj, dep||xcomp, aux||xcomp, dobj||acl,
nummod||pobj, amod||npadvmod, nsubj||pcomp, advcl||acl, appos||nmod,
relcl||oprd, prep||prep, cc||pobj, nmod||nsubj, amod||attr, aux||dep,
appos||conj, advmod||nsubj, nsubj||advcl, acl||conj
To train a parser, your data should include at least 20 instances of each label.
⚠ Multiple root labels (ROOT, nsubj, aux, npadvmod, prep) found in
training data. spaCy's parser uses a single root label ROOT so this distinction
will not be available.
================================== Summary ==================================
✔ 5 checks passed
⚠ 8 warnings
```
</Accordion>
## Train {#train} ## Train {#train}
@ -200,14 +360,15 @@ will only train the tagger and parser.
```bash ```bash
$ python -m spacy train [lang] [output_path] [train_path] [dev_path] $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] [--n-examples] [--use-gpu] [--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping]
[--version] [--meta-path] [--init-tok2vec] [--parser-multitasks] [--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec]
[--entity-multitasks] [--gold-preproc] [--noise-level] [--learn-tokens] [--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level]
[--learn-tokens] [--textcat-arch] [--textcat-multilabel] [--textcat-positive-label]
[--verbose] [--verbose]
``` ```
| Argument | Type | Description | | Argument | Type | Description |
| ----------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | positional | Model language. | | `lang` | positional | Model language. |
| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. | | `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. |
| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | | `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. |
@ -227,6 +388,9 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path]
| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | | `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. |
| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | | `--gold-preproc`, `-G` | flag | Use gold preprocessing. |
| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. | | `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging subtokens. Typically used for languages like Chinese. |
| `--textcat-multilabel`, `-TML` <Tag variant="new">2.2</Tag> | flag | Text classification classes aren't mutually exclusive (multilabel). |
| `--textcat-arch`, `-ta` <Tag variant="new">2.2</Tag> | option | Text classification model architecture. Defaults to `"bow"`. |
| `--textcat-positive-label`, `-tpl` <Tag variant="new">2.2</Tag> | option | Text classification positive label for binary classes with two labels. |
| `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. | | `--verbose`, `-VV` <Tag variant="new">2.0.13</Tag> | flag | Show more detailed messages during training. |
| `--help`, `-h` | flag | Show help message and available arguments. | | `--help`, `-h` | flag | Show help message and available arguments. |
| **CREATES** | model, pickle | A spaCy model on each epoch. | | **CREATES** | model, pickle | A spaCy model on each epoch. |

View File

@ -45,9 +45,9 @@ Append a token to the `Doc`. The token can be provided as a
> from spacy.vocab cimport Vocab > from spacy.vocab cimport Vocab
> >
> doc = Doc(Vocab()) > doc = Doc(Vocab())
> lexeme = doc.vocab.get(u'hello') > lexeme = doc.vocab.get("hello")
> doc.push_back(lexeme, True) > doc.push_back(lexeme, True)
> assert doc.text == u'hello ' > assert doc.text == "hello "
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -164,7 +164,7 @@ vocabulary.
> #### Example > #### Example
> >
> ```python > ```python
> lexeme = vocab.get(vocab.mem, u'hello') > lexeme = vocab.get(vocab.mem, "hello")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |

View File

@ -88,7 +88,7 @@ Find a token in a `TokenC*` array by the offset of its first character.
> from spacy.tokens.doc cimport Doc, token_by_start > from spacy.tokens.doc cimport Doc, token_by_start
> from spacy.vocab cimport Vocab > from spacy.vocab cimport Vocab
> >
> doc = Doc(Vocab(), words=[u'hello', u'world']) > doc = Doc(Vocab(), words=["hello", "world"])
> assert token_by_start(doc.c, doc.length, 6) == 1 > assert token_by_start(doc.c, doc.length, 6) == 1
> assert token_by_start(doc.c, doc.length, 4) == -1 > assert token_by_start(doc.c, doc.length, 4) == -1
> ``` > ```
@ -110,7 +110,7 @@ Find a token in a `TokenC*` array by the offset of its final character.
> from spacy.tokens.doc cimport Doc, token_by_end > from spacy.tokens.doc cimport Doc, token_by_end
> from spacy.vocab cimport Vocab > from spacy.vocab cimport Vocab
> >
> doc = Doc(Vocab(), words=[u'hello', u'world']) > doc = Doc(Vocab(), words=["hello", "world"])
> assert token_by_end(doc.c, doc.length, 5) == 0 > assert token_by_end(doc.c, doc.length, 5) == 0
> assert token_by_end(doc.c, doc.length, 1) == -1 > assert token_by_end(doc.c, doc.length, 1) == -1
> ``` > ```
@ -134,7 +134,7 @@ attribute, in order to make the parse tree navigation consistent.
> from spacy.tokens.doc cimport Doc, set_children_from_heads > from spacy.tokens.doc cimport Doc, set_children_from_heads
> from spacy.vocab cimport Vocab > from spacy.vocab cimport Vocab
> >
> doc = Doc(Vocab(), words=[u'Baileys', u'from', u'a', u'shoe']) > doc = Doc(Vocab(), words=["Baileys", "from", "a", "shoe"])
> doc.c[0].head = 0 > doc.c[0].head = 0
> doc.c[1].head = 0 > doc.c[1].head = 0
> doc.c[2].head = 3 > doc.c[2].head = 3

View File

@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
> >
> ```python > ```python
> parser = DependencyParser(nlp.vocab) > parser = DependencyParser(nlp.vocab)
> doc = nlp(u"This is a sentence.") > doc = nlp("This is a sentence.")
> # This usually happens under the hood > # This usually happens under the hood
> processed = parser(doc) > processed = parser(doc)
> ``` > ```

View File

@ -20,11 +20,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> >
> ```python > ```python
> # Construction 1 > # Construction 1
> doc = nlp(u"Some text") > doc = nlp("Some text")
> >
> # Construction 2 > # Construction 2
> from spacy.tokens import Doc > from spacy.tokens import Doc
> words = [u"hello", u"world", u"!"] > words = ["hello", "world", "!"]
> spaces = [True, False, False] > spaces = [True, False, False]
> doc = Doc(nlp.vocab, words=words, spaces=spaces) > doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ``` > ```
@ -45,7 +45,7 @@ Negative indexing is supported, and follows the usual Python semantics, i.e.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> assert doc[0].text == "Give" > assert doc[0].text == "Give"
> assert doc[-1].text == "." > assert doc[-1].text == "."
> span = doc[1:3] > span = doc[1:3]
@ -76,8 +76,8 @@ Iterate over `Token` objects, from which the annotations can be easily accessed.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u'Give it back') > doc = nlp("Give it back")
> assert [t.text for t in doc] == [u'Give', u'it', u'back'] > assert [t.text for t in doc] == ["Give", "it", "back"]
> ``` > ```
This is the main way of accessing [`Token`](/api/token) objects, which are the This is the main way of accessing [`Token`](/api/token) objects, which are the
@ -96,7 +96,7 @@ Get the number of tokens in the document.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> assert len(doc) == 7 > assert len(doc) == 7
> ``` > ```
@ -114,9 +114,9 @@ details, see the documentation on
> >
> ```python > ```python
> from spacy.tokens import Doc > from spacy.tokens import Doc
> city_getter = lambda doc: any(city in doc.text for city in ('New York', 'Paris', 'Berlin')) > city_getter = lambda doc: any(city in doc.text for city in ("New York", "Paris", "Berlin"))
> Doc.set_extension('has_city', getter=city_getter) > Doc.set_extension("has_city", getter=city_getter)
> doc = nlp(u'I like New York') > doc = nlp("I like New York")
> assert doc._.has_city > assert doc._.has_city
> ``` > ```
@ -192,8 +192,8 @@ the character indices don't map to a valid span.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like New York") > doc = nlp("I like New York")
> span = doc.char_span(7, 15, label=u"GPE") > span = doc.char_span(7, 15, label="GPE")
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
@ -213,8 +213,8 @@ using an average of word vectors.
> #### Example > #### Example
> >
> ```python > ```python
> apples = nlp(u"I like apples") > apples = nlp("I like apples")
> oranges = nlp(u"I like oranges") > oranges = nlp("I like oranges")
> apples_oranges = apples.similarity(oranges) > apples_oranges = apples.similarity(oranges)
> oranges_apples = oranges.similarity(apples) > oranges_apples = oranges.similarity(apples)
> assert apples_oranges == oranges_apples > assert apples_oranges == oranges_apples
@ -235,7 +235,7 @@ attribute ID.
> >
> ```python > ```python
> from spacy.attrs import ORTH > from spacy.attrs import ORTH
> doc = nlp(u"apple apple orange banana") > doc = nlp("apple apple orange banana")
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} > assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
> doc.to_array([ORTH]) > doc.to_array([ORTH])
> # array([[11880], [11880], [7561], [12800]]) > # array([[11880], [11880], [7561], [12800]])
@ -255,7 +255,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"This is a test") > doc = nlp("This is a test")
> matrix = doc.get_lca_matrix() > matrix = doc.get_lca_matrix()
> # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32) > # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
> ``` > ```
@ -274,7 +274,7 @@ They'll be added to an `"_"` key in the data, e.g. `"_": {"foo": "bar"}`.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Hello") > doc = nlp("Hello")
> json_doc = doc.to_json() > json_doc = doc.to_json()
> ``` > ```
> >
@ -342,7 +342,7 @@ array of attributes.
> ```python > ```python
> from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA > from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
> from spacy.tokens import Doc > from spacy.tokens import Doc
> doc = nlp(u"Hello world!") > doc = nlp("Hello world!")
> np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) > np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
> doc2 = Doc(doc.vocab, words=[t.text for t in doc]) > doc2 = Doc(doc.vocab, words=[t.text for t in doc])
> doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array) > doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
@ -396,7 +396,7 @@ Serialize, i.e. export the document contents to a binary string.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Give it back! He pleaded.") > doc = nlp("Give it back! He pleaded.")
> doc_bytes = doc.to_bytes() > doc_bytes = doc.to_bytes()
> ``` > ```
@ -413,10 +413,9 @@ Deserialize, i.e. import the document contents from a binary string.
> >
> ```python > ```python
> from spacy.tokens import Doc > from spacy.tokens import Doc
> text = u"Give it back! He pleaded." > doc = nlp("Give it back! He pleaded.")
> doc = nlp(text) > doc_bytes = doc.to_bytes()
> bytes = doc.to_bytes() > doc2 = Doc(doc.vocab).from_bytes(doc_bytes)
> doc2 = Doc(doc.vocab).from_bytes(bytes)
> assert doc.text == doc2.text > assert doc.text == doc2.text
> ``` > ```
@ -457,9 +456,9 @@ dictionary mapping attribute names to values as the `"_"` key.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like David Bowie") > doc = nlp("I like David Bowie")
> with doc.retokenize() as retokenizer: > with doc.retokenize() as retokenizer:
> attrs = {"LEMMA": u"David Bowie"} > attrs = {"LEMMA": "David Bowie"}
> retokenizer.merge(doc[2:4], attrs=attrs) > retokenizer.merge(doc[2:4], attrs=attrs)
> ``` > ```
@ -489,7 +488,7 @@ underlying lexeme (if they're context-independent lexical attributes like
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I live in NewYork") > doc = nlp("I live in NewYork")
> with doc.retokenize() as retokenizer: > with doc.retokenize() as retokenizer:
> heads = [(doc[3], 1), doc[2]] > heads = [(doc[3], 1), doc[2]]
> attrs = {"POS": ["PROPN", "PROPN"], > attrs = {"POS": ["PROPN", "PROPN"],
@ -521,9 +520,9 @@ and end token boundaries, the document remains unchanged.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Los Angeles start.") > doc = nlp("Los Angeles start.")
> doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE") > doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE")
> assert [t.text for t in doc] == [u"Los Angeles", u"start", u"."] > assert [t.text for t in doc] == ["Los Angeles", "start", "."]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -541,11 +540,11 @@ objects, if the entity recognizer has been applied.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"Mr. Best flew to New York on Saturday morning.") > doc = nlp("Mr. Best flew to New York on Saturday morning.")
> ents = list(doc.ents) > ents = list(doc.ents)
> assert ents[0].label == 346 > assert ents[0].label == 346
> assert ents[0].label_ == u"PERSON" > assert ents[0].label_ == "PERSON"
> assert ents[0].text == u"Mr. Best" > assert ents[0].text == "Mr. Best"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -563,10 +562,10 @@ relative clauses.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"A phrase with another phrase occurs.") > doc = nlp("A phrase with another phrase occurs.")
> chunks = list(doc.noun_chunks) > chunks = list(doc.noun_chunks)
> assert chunks[0].text == u"A phrase" > assert chunks[0].text == "A phrase"
> assert chunks[1].text == u"another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -583,10 +582,10 @@ will be unavailable.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"This is a sentence. Here's another...") > doc = nlp("This is a sentence. Here's another...")
> sents = list(doc.sents) > sents = list(doc.sents)
> assert len(sents) == 2 > assert len(sents) == 2
> assert [s.root.text for s in sents] == [u"is", u"'s"] > assert [s.root.text for s in sents] == ["is", "'s"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -600,7 +599,7 @@ A boolean value indicating whether a word vector is associated with the object.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples") > doc = nlp("I like apples")
> assert doc.has_vector > assert doc.has_vector
> ``` > ```
@ -616,8 +615,8 @@ vectors.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"I like apples") > doc = nlp("I like apples")
> assert doc.vector.dtype == 'float32' > assert doc.vector.dtype == "float32"
> assert doc.vector.shape == (300,) > assert doc.vector.shape == (300,)
> ``` > ```
@ -632,8 +631,8 @@ The L2 norm of the document's vector representation.
> #### Example > #### Example
> >
> ```python > ```python
> doc1 = nlp(u"I like apples") > doc1 = nlp("I like apples")
> doc2 = nlp(u"I like oranges") > doc2 = nlp("I like oranges")
> doc1.vector_norm # 4.54232424414368 > doc1.vector_norm # 4.54232424414368
> doc2.vector_norm # 3.304373298575751 > doc2.vector_norm # 3.304373298575751
> assert doc1.vector_norm != doc2.vector_norm > assert doc1.vector_norm != doc2.vector_norm

149
website/docs/api/docbin.md Normal file
View File

@ -0,0 +1,149 @@
---
title: DocBin
tag: class
new: 2.2
teaser: Pack Doc objects for binary serialization
source: spacy/tokens/_serialize.py
---
The `DocBin` class lets you efficiently serialize the information from a
collection of `Doc` objects. You can control which information is serialized by
passing a list of attribute IDs, and optionally also specify whether the user
data is serialized. The `DocBin` is faster and produces smaller data sizes than
pickle, and allows you to deserialize without executing arbitrary Python code. A
notable downside to this format is that you can't easily extract just one
document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure:
```python
### msgpack object structure
{
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
"tokens": bytes, # Serialized numpy uint64 array with the token data
"spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data
}
```
Strings for the words, tags, labels etc. are represented by 64-bit hashes in the
token data, and every string that occurs at least once is passed via the strings
object. This means the storage is more efficient if you pack more documents
together, because you have less duplication in the strings. For usage examples,
see the docs on [serializing `Doc` objects](/usage/saving-loading#docs).
## DocBin.\_\_init\_\_ {#init tag="method"}
Create a `DocBin` object to hold serialized annotations.
> #### Example
>
> ```python
> from spacy.tokens import DocBin
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
> ```
| Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data`. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. |
## DocBin.\_\_len\_\_ {#len tag="method"}
Get the number of `Doc` objects that were added to the `DocBin`.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> assert len(doc_bin) == 1
> ```
| Argument | Type | Description |
| ----------- | ---- | ------------------------------------------- |
| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. |
## DocBin.add {#add tag="method"}
Add a `Doc`'s annotations to the `DocBin` for serialization.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["LEMMA"])
> doc = nlp("This is a document to serialize.")
> doc_bin.add(doc)
> ```
| Argument | Type | Description |
| -------- | ----- | ------------------------ |
| `doc` | `Doc` | The `Doc` object to add. |
## DocBin.get_docs {#get_docs tag="method"}
Recover `Doc` objects from the annotations, using the given vocab.
> #### Example
>
> ```python
> docs = list(doc_bin.get_docs(nlp.vocab))
> ```
| Argument | Type | Description |
| ---------- | ------- | ------------------ |
| `vocab` | `Vocab` | The shared vocab. |
| **YIELDS** | `Doc` | The `Doc` objects. |
## DocBin.merge {#merge tag="method"}
Extend the annotations of this `DocBin` with the annotations from another. Will
raise an error if the pre-defined attrs of the two `DocBin`s don't match.
> #### Example
>
> ```python
> doc_bin1 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin1.add(nlp("Hello world"))
> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin2.add(nlp("This is a sentence"))
> doc_bin1.merge(doc_bin2)
> assert len(doc_bin1) == 2
> ```
| Argument | Type | Description |
| -------- | -------- | ------------------------------------------- |
| `other` | `DocBin` | The `DocBin` to merge into the current bin. |
## DocBin.to_bytes {#to_bytes tag="method"}
Serialize the `DocBin`'s annotations to a bytestring.
> #### Example
>
> ```python
> doc_bin = DocBin(attrs=["DEP", "HEAD"])
> doc_bin_bytes = doc_bin.to_bytes()
> ```
| Argument | Type | Description |
| ----------- | ----- | ------------------------ |
| **RETURNS** | bytes | The serialized `DocBin`. |
## DocBin.from_bytes {#from_bytes tag="method"}
Deserialize the `DocBin`'s annotations from a bytestring.
> #### Example
>
> ```python
> doc_bin_bytes = doc_bin.to_bytes()
> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
> ```
| Argument | Type | Description |
| ------------ | -------- | ---------------------- |
| `bytes_data` | bytes | The data to load from. |
| **RETURNS** | `DocBin` | The loaded `DocBin`. |

View File

@ -0,0 +1,300 @@
---
title: EntityLinker
teaser:
Functionality to disambiguate a named entity in text to a unique knowledge
base identifier.
tag: class
source: spacy/pipeline/pipes.pyx
new: 2.2
---
This class is a subclass of `Pipe` and follows the same API. The pipeline
component is available in the [processing pipeline](/usage/processing-pipelines)
via the ID `"entity_linker"`.
## EntityLinker.Model {#model tag="classmethod"}
Initialize a model for the pipe. The model should implement the
`thinc.neural.Model` API, and should contain a field `tok2vec` that contains the
context encoder. Wrappers are under development for most major machine learning
libraries.
| Name | Type | Description |
| ----------- | ------ | ------------------------------------- |
| `**kwargs` | - | Parameters for initializing the model |
| **RETURNS** | object | The initialized model. |
## EntityLinker.\_\_init\_\_ {#init tag="method"}
Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.create_pipe`](/api/language#create_pipe).
> #### Example
>
> ```python
> # Construction via create_pipe
> entity_linker = nlp.create_pipe("entity_linker")
>
> # Construction from class
> from spacy.pipeline import EntityLinker
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.from_disk("/path/to/model")
> ```
| Name | Type | Description |
| -------------- | ----------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `hidden_width` | int | Width of the hidden layer of the entity linking model, defaults to 128. |
| `incl_prior` | bool | Whether or not to include prior probabilities in the model. Defaults to True. |
| `incl_context` | bool | Whether or not to include the local context in the model (if not: only prior probabilities are used). Defaults to True. |
| **RETURNS** | `EntityLinker` | The newly constructed object. |
## EntityLinker.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)
delegate to the [`predict`](/api/entitylinker#predict) and
[`set_annotations`](/api/entitylinker#set_annotations) methods.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> doc = nlp("This is a sentence.")
> # This usually happens under the hood
> processed = entity_linker(doc)
> ```
| Name | Type | Description |
| ----------- | ----- | ------------------------ |
| `doc` | `Doc` | The document to process. |
| **RETURNS** | `Doc` | The processed document. |
## EntityLinker.pipe {#pipe tag="method"}
Apply the pipe to a stream of documents. This usually happens under the hood
when the `nlp` object is called on a text and all pipeline components are
applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
[`pipe`](/api/entitylinker#pipe) delegate to the
[`predict`](/api/entitylinker#predict) and
[`set_annotations`](/api/entitylinker#set_annotations) methods.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> for doc in entity_linker.pipe(docs, batch_size=50):
> pass
> ```
| Name | Type | Description |
| ------------ | -------- | ------------------------------------------------------ |
| `stream` | iterable | A stream of documents. |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
## EntityLinker.predict {#predict tag="method"}
Apply the pipeline's model to a batch of docs, without modifying them.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
> ```
| Name | Type | Description |
| ----------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to predict. |
| **RETURNS** | tuple | A `(kb_ids, tensors)` tuple where `kb_ids` are the model's predicted KB identifiers for the entities in the `docs`, and `tensors` are the token representations used to predict these identifiers. |
## EntityLinker.set_annotations {#set_annotations tag="method"}
Modify a batch of documents, using pre-computed entity IDs for a list of named
entities.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict([doc1, doc2])
> entity_linker.set_annotations([doc1, doc2], kb_ids, tensors)
> ```
| Name | Type | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------- |
| `docs` | iterable | The documents to modify. |
| `kb_ids` | iterable | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
| `tensors` | iterable | The token representations used to predict the identifiers. |
## EntityLinker.update {#update tag="method"}
Learn from a batch of documents and gold-standard information, updating both the
pipe's entity linking model and context encoder. Delegates to
[`predict`](/api/entitylinker#predict) and
[`get_loss`](/api/entitylinker#get_loss).
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> losses = {}
> optimizer = nlp.begin_training()
> entity_linker.update([doc1, doc2], [gold1, gold2], losses=losses, sgd=optimizer)
> ```
| Name | Type | Description |
| -------- | -------- | ------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A batch of documents to learn from. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `drop` | float | The dropout rate, used both for the EL model and the context encoder. |
| `sgd` | callable | The optimizer for the EL model. Should take two arguments `weights` and `gradient`, and an optional ID. |
| `losses` | dict | Optional record of the loss during training. The value keyed by the model's name is updated. |
## EntityLinker.get_loss {#get_loss tag="method"}
Find the loss and gradient of loss for the entities in a batch of documents and
their predicted scores.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> kb_ids, tensors = entity_linker.predict(docs)
> loss, d_loss = entity_linker.get_loss(docs, [gold1, gold2], kb_ids, tensors)
> ```
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------ |
| `docs` | iterable | The batch of documents. |
| `golds` | iterable | The gold-standard data. Must have the same length as `docs`. |
| `kb_ids` | iterable | KB identifiers representing the model's predictions. |
| `tensors` | iterable | The token representations used to predict the identifiers |
| **RETURNS** | tuple | The loss and the gradient, i.e. `(loss, gradient)`. |
## EntityLinker.set_kb {#set_kb tag="method"}
Define the knowledge base (KB) used for disambiguating named entities to KB
identifiers.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.set_kb(kb)
> ```
| Name | Type | Description |
| ---- | --------------- | ------------------------------- |
| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). |
## EntityLinker.begin_training {#begin_training tag="method"}
Initialize the pipe for training, using data examples if available. If no model
has been initialized yet, the model is added. Before calling this method, a
knowledge base should have been defined with
[`set_kb`](/api/entitylinker#set_kb).
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.set_kb(kb)
> nlp.add_pipe(entity_linker, last=True)
> optimizer = entity_linker.begin_training(pipeline=nlp.pipeline)
> ```
| Name | Type | Description |
| ------------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `gold_tuples` | iterable | Optional gold-standard annotations from which to construct [`GoldParse`](/api/goldparse) objects. |
| `pipeline` | list | Optional list of pipeline components that this component is part of. |
| `sgd` | callable | An optional optimizer. Should take two arguments `weights` and `gradient`, and an optional ID. Will be created via [`create_optimizer`](/api/entitylinker#create_optimizer) if not set. |
| **RETURNS** | callable | An optimizer. |
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
Create an optimizer for the pipeline component.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> optimizer = entity_linker.create_optimizer()
> ```
| Name | Type | Description |
| ----------- | -------- | -------------- |
| **RETURNS** | callable | The optimizer. |
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
Modify the pipe's EL model, to use the given parameter values.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> with entity_linker.use_params(optimizer.averages):
> entity_linker.to_disk("/best_model")
> ```
| Name | Type | Description |
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
## EntityLinker.to_disk {#to_disk tag="method"}
Serialize the pipe to disk.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.to_disk("/path/to/entity_linker")
> ```
| Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityLinker.from_disk {#from_disk tag="method"}
Load the pipe from disk. Modifies the object in place and returns it.
> #### Example
>
> ```python
> entity_linker = EntityLinker(nlp.vocab)
> entity_linker.from_disk("/path/to/entity_linker")
> ```
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
## Serialization fields {#serialization-fields}
During serialization, spaCy will export several data fields used to restore
different aspects of the object. If needed, you can exclude them from
serialization by passing in the string names via the `exclude` argument.
> #### Example
>
> ```python
> data = entity_linker.to_disk("/path", exclude=["vocab"])
> ```
| Name | Description |
| ------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `cfg` | The config file. You usually don't want to exclude this. |
| `model` | The binary model data. You usually don't want to exclude this. |
| `kb` | The knowledge base. You usually don't want to exclude this. |

View File

@ -58,7 +58,7 @@ and all pipeline components are applied to the `Doc` in order. Both
> >
> ```python > ```python
> ner = EntityRecognizer(nlp.vocab) > ner = EntityRecognizer(nlp.vocab)
> doc = nlp(u"This is a sentence.") > doc = nlp("This is a sentence.")
> # This usually happens under the hood > # This usually happens under the hood
> processed = ner(doc) > processed = ner(doc)
> ``` > ```
@ -99,7 +99,7 @@ Apply the pipeline's model to a batch of docs, without modifying them.
> >
> ```python > ```python
> ner = EntityRecognizer(nlp.vocab) > ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2]) > scores, tensors = ner.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -115,14 +115,15 @@ Modify a batch of documents, using pre-computed scores.
> >
> ```python > ```python
> ner = EntityRecognizer(nlp.vocab) > ner = EntityRecognizer(nlp.vocab)
> scores = ner.predict([doc1, doc2]) > scores, tensors = ner.predict([doc1, doc2])
> ner.set_annotations([doc1, doc2], scores) > ner.set_annotations([doc1, doc2], scores, tensors)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------- | -------- | ---------------------------------------------------------- | | --------- | -------- | ---------------------------------------------------------- |
| `docs` | iterable | The documents to modify. | | `docs` | iterable | The documents to modify. |
| `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. | | `scores` | - | The scores to set, produced by `EntityRecognizer.predict`. |
| `tensors` | iterable | The token representations used to predict the scores. |
## EntityRecognizer.update {#update tag="method"} ## EntityRecognizer.update {#update tag="method"}
@ -210,13 +211,13 @@ Modify the pipe's model, to use the given parameter values.
> >
> ```python > ```python
> ner = EntityRecognizer(nlp.vocab) > ner = EntityRecognizer(nlp.vocab)
> with ner.use_params(): > with ner.use_params(optimizer.averages):
> ner.to_disk("/best_model") > ner.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------- | ---- | ---------------------------------------------------------------------------------------------------------- | | -------- | ---- | ---------------------------------------------------------------------------------------------------------- |
| `params` | - | The parameter values to use in the model. At the end of the context, the original parameters are restored. | | `params` | dict | The parameter values to use in the model. At the end of the context, the original parameters are restored. |
## EntityRecognizer.add_label {#add_label tag="method"} ## EntityRecognizer.add_label {#add_label tag="method"}

View File

@ -10,7 +10,9 @@ token-based rules or exact phrase matches. It can be combined with the
statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or statistical [`EntityRecognizer`](/api/entityrecognizer) to boost accuracy, or
used on its own to implement a purely rule-based entity recognition system. used on its own to implement a purely rule-based entity recognition system.
After initialization, the component is typically added to the processing After initialization, the component is typically added to the processing
pipeline using [`nlp.add_pipe`](/api/language#add_pipe). pipeline using [`nlp.add_pipe`](/api/language#add_pipe). For usage examples, see
the docs on
[rule-based entity recognition](/usage/rule-based-matching#entityruler).
## EntityRuler.\_\_init\_\_ {#init tag="method"} ## EntityRuler.\_\_init\_\_ {#init tag="method"}

View File

@ -23,6 +23,7 @@ gradient for those labels will be zero.
| `deps` | iterable | A sequence of strings, representing the syntactic relation types. | | `deps` | iterable | A sequence of strings, representing the syntactic relation types. |
| `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. | | `entities` | iterable | A sequence of named entity annotations, either as BILUO tag strings, or as `(start_char, end_char, label)` tuples, representing the entity positions. If BILUO tag strings, you can specify missing values by setting the tag to None. |
| `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). | | `cats` | dict | Labels for text classification. Each key in the dictionary may be a string or an int, or a `(start_char, end_char, label)` tuple, indicating that the label is applied to only part of the document (usually a sentence). |
| `links` | dict | Labels for entity linking. A dict with `(start_char, end_char)` keys, and the values being dicts with `kb_id:value` entries, representing external KB IDs mapped to either 1.0 (positive) or 0.0 (negative). |
| **RETURNS** | `GoldParse` | The newly constructed object. | | **RETURNS** | `GoldParse` | The newly constructed object. |
## GoldParse.\_\_len\_\_ {#len tag="method"} ## GoldParse.\_\_len\_\_ {#len tag="method"}
@ -44,7 +45,7 @@ Whether the provided syntactic annotations form a projective dependency tree.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| --------------------------------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------ | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `words` | list | The words. | | `words` | list | The words. |
| `tags` | list | The part-of-speech tag annotations. | | `tags` | list | The part-of-speech tag annotations. |
| `heads` | list | The syntactic head annotations. | | `heads` | list | The syntactic head annotations. |
@ -53,6 +54,7 @@ Whether the provided syntactic annotations form a projective dependency tree.
| `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. | | `cand_to_gold` | list | The alignment from candidate tokenization to gold tokenization. |
| `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. | | `gold_to_cand` | list | The alignment from gold tokenization to candidate tokenization. |
| `cats` <Tag variant="new">2</Tag> | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. | | `cats` <Tag variant="new">2</Tag> | list | Entries in the list should be either a label, or a `(start, end, label)` triple. The tuple form is used for categories applied to spans of the document. |
| `links` <Tag variant="new">2.2</Tag> | dict | Keys in the dictionary are `(start_char, end_char)` tuples, and the values are dictionaries with `kb_id:value` entries. |
## Utilities {#util} ## Utilities {#util}
@ -67,7 +69,7 @@ Convert a list of Doc objects into the
> ```python > ```python
> from spacy.gold import docs_to_json > from spacy.gold import docs_to_json
> >
> doc = nlp(u"I like London") > doc = nlp("I like London")
> json_data = docs_to_json([doc]) > json_data = docs_to_json([doc])
> ``` > ```
@ -148,7 +150,7 @@ single-token entity.
> ```python > ```python
> from spacy.gold import biluo_tags_from_offsets > from spacy.gold import biluo_tags_from_offsets
> >
> doc = nlp(u"I like London.") > doc = nlp("I like London.")
> entities = [(7, 13, "LOC")] > entities = [(7, 13, "LOC")]
> tags = biluo_tags_from_offsets(doc, entities) > tags = biluo_tags_from_offsets(doc, entities)
> assert tags == ["O", "O", "U-LOC", "O"] > assert tags == ["O", "O", "U-LOC", "O"]
@ -170,7 +172,7 @@ entity offsets.
> ```python > ```python
> from spacy.gold import offsets_from_biluo_tags > from spacy.gold import offsets_from_biluo_tags
> >
> doc = nlp(u"I like London.") > doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"] > tags = ["O", "O", "U-LOC", "O"]
> entities = offsets_from_biluo_tags(doc, tags) > entities = offsets_from_biluo_tags(doc, tags)
> assert entities == [(7, 13, "LOC")] > assert entities == [(7, 13, "LOC")]
@ -193,7 +195,7 @@ token-based tags, e.g. to overwrite the `doc.ents`.
> ```python > ```python
> from spacy.gold import spans_from_biluo_tags > from spacy.gold import spans_from_biluo_tags
> >
> doc = nlp(u"I like London.") > doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"] > tags = ["O", "O", "U-LOC", "O"]
> doc.ents = spans_from_biluo_tags(doc, tags) > doc.ents = spans_from_biluo_tags(doc, tags)
> ``` > ```

268
website/docs/api/kb.md Normal file
View File

@ -0,0 +1,268 @@
---
title: KnowledgeBase
teaser: A storage class for entities and aliases of a specific knowledge base (ontology)
tag: class
source: spacy/kb.pyx
new: 2.2
---
The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
objects, which are plausible external identifiers given a certain textual mention.
Each such `Candidate` holds information from the relevant KB entities,
such as its frequency in text and possible aliases.
Each entity in the knowledge base also has a pre-trained entity vector of a fixed size.
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
Create the knowledge base.
> #### Example
>
> ```python
> from spacy.kb import KnowledgeBase
> vocab = nlp.vocab
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> ```
| Name | Type | Description |
| ----------------------- | ---------------- | ----------------------------------------- |
| `vocab` | `Vocab` | A `Vocab` object. |
| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
The length of the fixed-size entity vectors in the knowledge base.
| Name | Type | Description |
| ----------- | ---- | ----------------------------------------- |
| **RETURNS** | int | Length of the fixed-size entity vectors. |
## KnowledgeBase.add_entity {#add_entity tag="method"}
Add an entity to the knowledge base, specifying its corpus frequency
and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
> #### Example
>
> ```python
> kb.add_entity(entity="Q42", freq=32, entity_vector=vector1)
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
> ```
| Name | Type | Description |
| --------------- | ------------- | ------------------------------------------------- |
| `entity` | unicode | The unique entity identifier |
| `freq` | float | The frequency of the entity in a typical corpus |
| `entity_vector` | vector | The pre-trained vector of the entity |
## KnowledgeBase.set_entities {#set_entities tag="method"}
Define the full list of entities in the knowledge base, specifying the corpus frequency
and entity vector for each entity.
> #### Example
>
> ```python
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
> ```
| Name | Type | Description |
| ------------- | ------------- | ------------------------------------------------- |
| `entity_list` | iterable | List of unique entity identifiers |
| `freq_list` | iterable | List of entity frequencies |
| `vector_list` | iterable | List of entity vectors |
## KnowledgeBase.add_alias {#add_alias tag="method"}
Add an alias or mention to the knowledge base, specifying its potential KB identifiers
and their prior probabilities. The entity identifiers should refer to entities previously
added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
The sum of the prior probabilities should not exceed 1.
> #### Example
>
> ```python
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
> ```
| Name | Type | Description |
| -------------- | ------------- | -------------------------------------------------- |
| `alias` | unicode | The textual mention or alias |
| `entities` | iterable | The potential entities that the alias may refer to |
| `probabilities`| iterable | The prior probabilities of each entity |
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
Get the total number of entities in the knowledge base.
> #### Example
>
> ```python
> total_entities = len(kb)
> ```
| Name | Type | Description |
| ----------- | ---- | --------------------------------------------- |
| **RETURNS** | int | The number of entities in the knowledge base. |
## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
Get a list of all entity IDs in the knowledge base.
> #### Example
>
> ```python
> all_entities = kb.get_entity_strings()
> ```
| Name | Type | Description |
| ----------- | ---- | --------------------------------------------- |
| **RETURNS** | list | The list of entities in the knowledge base. |
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
Get the total number of aliases in the knowledge base.
> #### Example
>
> ```python
> total_aliases = kb.get_size_aliases()
> ```
| Name | Type | Description |
| ----------- | ---- | --------------------------------------------- |
| **RETURNS** | int | The number of aliases in the knowledge base. |
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
Get a list of all aliases in the knowledge base.
> #### Example
>
> ```python
> all_aliases = kb.get_alias_strings()
> ```
| Name | Type | Description |
| ----------- | ---- | --------------------------------------------- |
| **RETURNS** | list | The list of aliases in the knowledge base. |
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb/#candidate_init).
> #### Example
>
> ```python
> candidates = kb.get_candidates("Douglas")
> ```
| Name | Type | Description |
| ------------- | ------------- | -------------------------------------------------- |
| `alias` | unicode | The textual mention or alias |
| **RETURNS** | iterable | The list of relevant `Candidate` objects |
## KnowledgeBase.get_vector {#get_vector tag="method"}
Given a certain entity ID, retrieve its pre-trained entity vector.
> #### Example
>
> ```python
> vector = kb.get_vector("Q42")
> ```
| Name | Type | Description |
| ------------- | ------------- | -------------------------------------------------- |
| `entity` | unicode | The entity ID |
| **RETURNS** | vector | The entity vector |
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
Given a certain entity ID and a certain textual mention, retrieve
the prior probability that the mention links to the entity ID.
> #### Example
>
> ```python
> probability = kb.get_prior_prob("Q42", "Douglas")
> ```
| Name | Type | Description |
| ------------- | ------------- | --------------------------------------------------------------- |
| `entity` | unicode | The entity ID |
| `alias` | unicode | The textual mention or alias |
| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
## KnowledgeBase.dump {#dump tag="method"}
Save the current state of the knowledge base to a directory.
> #### Example
>
> ```python
> kb.dump(loc)
> ```
| Name | Type | Description |
| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
should also be the same as the one used to create the KB.
> #### Example
>
> ```python
> from spacy.kb import KnowledgeBase
> from spacy.vocab import Vocab
> vocab = Vocab().from_disk("/path/to/vocab")
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> kb.load_bulk("/path/to/kb")
> ```
| Name | Type | Description |
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly,
but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method
of a `KnowledgeBase`.
> #### Example
>
> ```python
> from spacy.kb import Candidate
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> ```
| Name | Type | Description |
| ------------- | --------------- | -------------------------------------------------------------- |
| `kb` | `KnowledgeBase` | The knowledge base that defined this candidate. |
| `entity_hash` | int | The hash of the entity's KB ID. |
| `entity_freq` | float | The entity frequency as recorded in the KB. |
| `entity_vector` | vector | The pre-trained vector of the entity. |
| `alias_hash` | int | The hash of the textual mention or alias. |
| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` |
| **RETURNS** | `Candidate` | The newly constructed object. |
## Candidate attributes {#candidate_attributes}
| Name | Type | Description |
| ---------------------- | ------------ | ------------------------------------------------------------------ |
| `entity` | int | The entity's unique KB identifier |
| `entity_` | unicode | The entity's unique KB identifier |
| `alias` | int | The alias or textual mention |
| `alias_` | unicode | The alias or textual mention |
| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` |
| `entity_freq` | float | The frequency of the entity in a typical corpus |
| `entity_vector` | vector | The pre-trained vector of the entity |

View File

@ -45,7 +45,7 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> #### Example > #### Example
> >
> ```python > ```python
> doc = nlp(u"An example sentence. Another sentence.") > doc = nlp("An example sentence. Another sentence.")
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ``` > ```
@ -61,8 +61,8 @@ Pipeline components to prevent from being loaded can now be added as a list to
`disable`, instead of specifying one keyword argument per component. `disable`, instead of specifying one keyword argument per component.
```diff ```diff
- doc = nlp(u"I don't want parsed", parse=False) - doc = nlp("I don't want parsed", parse=False)
+ doc = nlp(u"I don't want parsed", disable=["parser"]) + doc = nlp("I don't want parsed", disable=["parser"])
``` ```
</Infobox> </Infobox>
@ -86,7 +86,7 @@ multiprocessing.
> #### Example > #### Example
> >
> ```python > ```python
> texts = [u"One document.", u"...", u"Lots of documents"] > texts = ["One document.", "...", "Lots of documents"]
> for doc in nlp.pipe(texts, batch_size=50): > for doc in nlp.pipe(texts, batch_size=50):
> assert doc.is_parsed > assert doc.is_parsed
> ``` > ```
@ -140,6 +140,7 @@ Evaluate a model's pipeline components.
| `batch_size` | int | The batch size to use. | | `batch_size` | int | The batch size to use. |
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | | `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
| `component_cfg` <Tag variant="new">2.1</Tag> | dict | Config parameters for specific pipeline components, keyed by component name. | | `component_cfg` <Tag variant="new">2.1</Tag> | dict | Config parameters for specific pipeline components, keyed by component name. |
| **RETURNS** | Scorer | The scorer containing the evaluation scores. |
## Language.begin_training {#begin_training tag="method"} ## Language.begin_training {#begin_training tag="method"}
@ -444,12 +445,13 @@ per component.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| --------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------- | | ------------------------------------------ | ----------- | ----------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A container for the lexical types. | | `vocab` | `Vocab` | A container for the lexical types. |
| `tokenizer` | `Tokenizer` | The tokenizer. | | `tokenizer` | `Tokenizer` | The tokenizer. |
| `make_doc` | `lambda text: Doc` | Create a `Doc` object from unicode text. | | `make_doc` | `callable` | Callable that takes a unicode text and returns a `Doc`. |
| `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. | | `pipeline` | list | List of `(name, component)` tuples describing the current processing pipeline, in order. |
| `pipe_names` <Tag variant="new">2</Tag> | list | List of pipeline component names, in order. | | `pipe_names` <Tag variant="new">2</Tag> | list | List of pipeline component names, in order. |
| `pipe_labels` <Tag variant="new">2.2</Tag> | dict | Labels set by the pipeline components, if available, keyed by component name. |
| `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. | | `meta` | dict | Custom meta data for the Language class. If a model is loaded, contains meta data of the model. |
| `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. | | `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. |

View File

@ -35,10 +35,10 @@ Lemmatize a string.
> >
> ```python > ```python
> from spacy.lemmatizer import Lemmatizer > from spacy.lemmatizer import Lemmatizer
> from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES > rules = {"noun": [["s", ""]]}
> lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) > lemmatizer = Lemmatizer(index={}, exceptions={}, rules=rules)
> lemmas = lemmatizer(u"ducks", u"NOUN") > lemmas = lemmatizer("ducks", "NOUN")
> assert lemmas == [u"duck"] > assert lemmas == ["duck"]
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -52,20 +52,21 @@ Lemmatize a string.
Look up a lemma in the lookup table, if available. If no lemma is found, the Look up a lemma in the lookup table, if available. If no lemma is found, the
original string is returned. Languages can provide a original string is returned. Languages can provide a
[lookup table](/usage/adding-languages#lemmatizer) via the `lemma_lookup` [lookup table](/usage/adding-languages#lemmatizer) via the `resources`, set on
variable, set on the individual `Language` class. the individual `Language` class.
> #### Example > #### Example
> >
> ```python > ```python
> lookup = {u"going": u"go"} > lookup = {"going": "go"}
> lemmatizer = Lemmatizer(lookup=lookup) > lemmatizer = Lemmatizer(lookup=lookup)
> assert lemmatizer.lookup(u"going") == u"go" > assert lemmatizer.lookup("going") == "go"
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ----------------------------------------------------------------- | | ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
| `string` | unicode | The string to look up. | | `string` | unicode | The string to look up. |
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. | | **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
## Lemmatizer.is_base_form {#is_base_form tag="method"} ## Lemmatizer.is_base_form {#is_base_form tag="method"}

View File

@ -27,7 +27,7 @@ Change the value of a boolean flag.
> >
> ```python > ```python
> COOL_FLAG = nlp.vocab.add_flag(lambda text: False) > COOL_FLAG = nlp.vocab.add_flag(lambda text: False)
> nlp.vocab[u'spaCy'].set_flag(COOL_FLAG, True) > nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -42,9 +42,9 @@ Check the value of a boolean flag.
> #### Example > #### Example
> >
> ```python > ```python
> is_my_library = lambda text: text in [u"spaCy", u"Thinc"] > is_my_library = lambda text: text in ["spaCy", "Thinc"]
> MY_LIBRARY = nlp.vocab.add_flag(is_my_library) > MY_LIBRARY = nlp.vocab.add_flag(is_my_library)
> assert nlp.vocab[u"spaCy"].check_flag(MY_LIBRARY) == True > assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -59,8 +59,8 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> #### Example > #### Example
> >
> ```python > ```python
> apple = nlp.vocab[u"apple"] > apple = nlp.vocab["apple"]
> orange = nlp.vocab[u"orange"] > orange = nlp.vocab["orange"]
> apple_orange = apple.similarity(orange) > apple_orange = apple.similarity(orange)
> orange_apple = orange.similarity(apple) > orange_apple = orange.similarity(apple)
> assert apple_orange == orange_apple > assert apple_orange == orange_apple
@ -78,7 +78,7 @@ A boolean value indicating whether a word vector is associated with the lexeme.
> #### Example > #### Example
> >
> ```python > ```python
> apple = nlp.vocab[u"apple"] > apple = nlp.vocab["apple"]
> assert apple.has_vector > assert apple.has_vector
> ``` > ```
@ -93,7 +93,7 @@ A real-valued meaning representation.
> #### Example > #### Example
> >
> ```python > ```python
> apple = nlp.vocab[u"apple"] > apple = nlp.vocab["apple"]
> assert apple.vector.dtype == "float32" > assert apple.vector.dtype == "float32"
> assert apple.vector.shape == (300,) > assert apple.vector.shape == (300,)
> ``` > ```
@ -109,8 +109,8 @@ The L2 norm of the lexeme's vector representation.
> #### Example > #### Example
> >
> ```python > ```python
> apple = nlp.vocab[u"apple"] > apple = nlp.vocab["apple"]
> pasta = nlp.vocab[u"pasta"] > pasta = nlp.vocab["pasta"]
> apple.vector_norm # 7.1346845626831055 > apple.vector_norm # 7.1346845626831055
> pasta.vector_norm # 7.759851932525635 > pasta.vector_norm # 7.759851932525635
> assert apple.vector_norm != pasta.vector_norm > assert apple.vector_norm != pasta.vector_norm

318
website/docs/api/lookups.md Normal file
View File

@ -0,0 +1,318 @@
---
title: Lookups
teaser: A container for large lookup tables and dictionaries
tag: class
source: spacy/lookups.py
new: 2.2
---
This class allows convenient access to large lookup tables and dictionaries,
e.g. lemmatization data or tokenizer exception lists using Bloom filters.
Lookups are available via the [`Vocab`](/api/vocab) as `vocab.lookups`, so they
can be accessed before the pipeline components are applied (e.g. in the
tokenizer and lemmatizer), as well as within the pipeline components via
`doc.vocab.lookups`.
## Lookups.\_\_init\_\_ {#init tag="method"}
Create a `Lookups` object.
> #### Example
>
> ```python
> from spacy.lookups import Lookups
> lookups = Lookups()
> ```
| Name | Type | Description |
| ----------- | --------- | ----------------------------- |
| **RETURNS** | `Lookups` | The newly constructed object. |
## Lookups.\_\_len\_\_ {#len tag="method"}
Get the current number of tables in the lookups.
> #### Example
>
> ```python
> lookups = Lookups()
> assert len(lookups) == 0
> ```
| Name | Type | Description |
| ----------- | ---- | ------------------------------------ |
| **RETURNS** | int | The number of tables in the lookups. |
## Lookups.\_\_contains\_\_ {#contains tag="method"}
Check if the lookups contain a table of a given name. Delegates to
[`Lookups.has_table`](/api/lookups#has_table).
> #### Example
>
> ```python
> lookups = Lookups()
> lookups.add_table("some_table")
> assert "some_table" in lookups
> ```
| Name | Type | Description |
| ----------- | ------- | ----------------------------------------------- |
| `name` | unicode | Name of the table. |
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
## Lookups.tables {#tables tag="property"}
Get the names of all tables in the lookups.
> #### Example
>
> ```python
> lookups = Lookups()
> lookups.add_table("some_table")
> assert lookups.tables == ["some_table"]
> ```
| Name | Type | Description |
| ----------- | ---- | ----------------------------------- |
| **RETURNS** | list | Names of the tables in the lookups. |
## Lookups.add_table {#add_table tag="method"}
Add a new table with optional data to the lookups. Raises an error if the table
already exists.
> #### Example
>
> ```python
> lookups = Lookups()
> lookups.add_table("some_table", {"foo": "bar"})
> ```
| Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------------- |
| `name` | unicode | Unique name of the table. |
| `data` | dict | Optional data to add to the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. |
## Lookups.get_table {#get_table tag="method"}
Get a table from the lookups. Raises an error if the table doesn't exist.
> #### Example
>
> ```python
> lookups = Lookups()
> lookups.add_table("some_table", {"foo": "bar"})
> table = lookups.get_table("some_table")
> assert table["foo"] == "bar"
> ```
| Name | Type | Description |
| ----------- | ----------------------------- | ------------------ |
| `name` | unicode | Name of the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The table. |
## Lookups.remove_table {#remove_table tag="method"}
Remove a table from the lookups. Raises an error if the table doesn't exist.
> #### Example
>
> ```python
> lookups = Lookups()
> lookups.add_table("some_table")
> removed_table = lookups.remove_table("some_table")
> assert "some_table" not in lookups
> ```
| Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------- |
| `name` | unicode | Name of the table to remove. |
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. |
## Lookups.has_table {#has_table tag="method"}
Check if the lookups contain a table of a given name. Equivalent to
[`Lookups.__contains__`](/api/lookups#contains).
> #### Example
>
> ```python
> lookups = Lookups()
> lookups.add_table("some_table")
> assert lookups.has_table("some_table")
> ```
| Name | Type | Description |
| ----------- | ------- | ----------------------------------------------- |
| `name` | unicode | Name of the table. |
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
## Lookups.to_bytes {#to_bytes tag="method"}
Serialize the lookups to a bytestring.
> #### Example
>
> ```python
> lookup_bytes = lookups.to_bytes()
> ```
| Name | Type | Description |
| ----------- | ----- | ----------------------- |
| **RETURNS** | bytes | The serialized lookups. |
## Lookups.from_bytes {#from_bytes tag="method"}
Load the lookups from a bytestring.
> #### Example
>
> ```python
> lookup_bytes = lookups.to_bytes()
> lookups = Lookups()
> lookups.from_bytes(lookup_bytes)
> ```
| Name | Type | Description |
| ------------ | --------- | ---------------------- |
| `bytes_data` | bytes | The data to load from. |
| **RETURNS** | `Lookups` | The loaded lookups. |
## Lookups.to_disk {#to_disk tag="method"}
Save the lookups to a directory as `lookups.bin`. Expects a path to a directory,
which will be created if it doesn't exist.
> #### Example
>
> ```python
> lookups.to_disk("/path/to/lookups")
> ```
| Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Lookups.from_disk {#from_disk tag="method"}
Load lookups from a directory containing a `lookups.bin`. Will skip loading if
the file doesn't exist.
> #### Example
>
> ```python
> from spacy.lookups import Lookups
> lookups = Lookups()
> lookups.from_disk("/path/to/lookups")
> ```
| Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `Lookups` | The loaded lookups. |
## Table {#table tag="class, ordereddict"}
A table in the lookups. Subclass of `OrderedDict` that implements a slightly
more consistent and unified API and includes a Bloom filter to speed up missed
lookups. Supports **all other methods and attributes** of `OrderedDict` /
`dict`, and the customized methods listed here. Methods that get or set keys
accept both integers and strings (which will be hashed before being added to the
table).
### Table.\_\_init\_\_ {#table.init tag="method"}
Initialize a new table.
> #### Example
>
> ```python
> from spacy.lookups import Table
> data = {"foo": "bar", "baz": 100}
> table = Table(name="some_table", data=data)
> assert "foo" in table
> assert table["foo"] == "bar"
> ```
| Name | Type | Description |
| ----------- | ------- | ---------------------------------- |
| `name` | unicode | Optional table name for reference. |
| `data` | dict | Optional data to initialize the table with. |
| **RETURNS** | `Table` | The newly constructed object. |
### Table.from_dict {#table.from_dict tag="classmethod"}
Initialize a new table from a dict.
> #### Example
>
> ```python
> from spacy.lookups import Table
> data = {"foo": "bar", "baz": 100}
> table = Table.from_dict(data, name="some_table")
> ```
| Name | Type | Description |
| ----------- | ------- | ---------------------------------- |
| `data` | dict | The dictionary. |
| `name` | unicode | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |
### Table.set {#table.set tag="method"}
Set a new key / value pair. String keys will be hashed. Same as
`table[key] = value`.
> #### Example
>
> ```python
> from spacy.lookups import Table
> table = Table()
> table.set("foo", "bar")
> assert table["foo"] == "bar"
> ```
| Name | Type | Description |
| ------- | ------------- | ----------- |
| `key` | unicode / int | The key. |
| `value` | - | The value. |
### Table.to_bytes {#table.to_bytes tag="method"}
Serialize the table to a bytestring.
> #### Example
>
> ```python
> table_bytes = table.to_bytes()
> ```
| Name | Type | Description |
| ----------- | ----- | --------------------- |
| **RETURNS** | bytes | The serialized table. |
### Table.from_bytes {#table.from_bytes tag="method"}
Load a table from a bytestring.
> #### Example
>
> ```python
> table_bytes = table.to_bytes()
> table = Table()
> table.from_bytes(table_bytes)
> ```
| Name | Type | Description |
| ------------ | ------- | ----------------- |
| `bytes_data` | bytes | The data to load. |
| **RETURNS** | `Table` | The loaded table. |
### Attributes {#table-attributes}
| Name | Type | Description |
| -------------- | --------------------------- | ----------------------------------------------------- |
| `name` | unicode | Table name. |
| `default_size` | int | Default size of bloom filters if no data is provided. |
| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. |

View File

@ -50,7 +50,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
> matcher.add("HelloWorld", None, pattern) > matcher.add("HelloWorld", None, pattern)
> doc = nlp(u'hello world!') > doc = nlp("hello world!")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
@ -147,7 +147,7 @@ overwritten.
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) > matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}])
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) > matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}])
> doc = nlp(u"HELLO WORLD on Google Maps.") > doc = nlp("HELLO WORLD on Google Maps.")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```

Some files were not shown because too many files have changed in this diff Show More