From e2299dc389bbf84ee1bd56edc23202ec5f9249e2 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:40:14 +0200 Subject: [PATCH 001/111] Ensure path in save_to_directory --- spacy/language.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/language.py b/spacy/language.py index f47b1d0cc..b356f4d8c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -350,6 +350,7 @@ class Language(object): 'ner': self.entity.cfg if self.entity else {}, } + path = util.ensure_path(path) self.setup_directory(path, **configs) strings_loc = path / 'vocab' / 'strings.json' From 8e83f8e2fabef373faece4802737567b2768a357 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:40:26 +0200 Subject: [PATCH 002/111] Update docstrings --- spacy/language.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index b356f4d8c..de97f7a63 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -188,6 +188,9 @@ class Language(object): @classmethod def setup_directory(cls, path, **configs): + """ + Initialise a model directory. + """ for name, config in configs.items(): directory = path / name if directory.exists(): @@ -295,7 +298,7 @@ class Language(object): and can contain arbitrary whitespace. Alignment into the original string is preserved. - Args: + Arguments: text (unicode): The text to be processed. Returns: @@ -344,6 +347,12 @@ class Language(object): yield doc def save_to_directory(self, path): + """ + Save the Vocab, StringStore and pipeline to a directory. + + Arguments: + path (string or pathlib path): Path to save the model. + """ configs = { 'pos': self.tagger.cfg if self.tagger else {}, 'deps': self.parser.cfg if self.parser else {}, From aad80a291f481fdbc75f9def859f146d2921da81 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:40:34 +0200 Subject: [PATCH 003/111] Add save_to_directory method to API docs --- website/docs/api/language.jade | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 93e7ff213..e221b9142 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -136,3 +136,19 @@ p +cell yield +cell #[code Doc] +cell Containers for accessing the linguistic annotations. + ++h(2, "save_to_directory") Language.save_to_directory + +tag method + +p Save the #[code Vocab], #[code StringStore] and pipeline to a directory. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell string or pathlib path + +cell Path to save the model. + + +footrow + +cell return + +cell #[code None] + +cell - From 7f776258f06bc88e908f725c1b0cf7b88ad4c73c Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:41:46 +0200 Subject: [PATCH 004/111] Add link to API docs --- website/docs/usage/saving-loading.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 063c5dc50..9fa23aaa7 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -2,7 +2,8 @@ include ../../_includes/_mixins p | After training your model, you'll usually want to save its state, and load - | it back later. You can do this with the #[code Language.save_to_directory()] + | it back later. You can do this with the + | #[+api("language#save_to_directory") #[code Language.save_to_directory()]] | method: +code.
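A minimal sketch of the `Language.save_to_directory()` usage these four patches add and document (illustrative only, not part of the patch series; the installed `en` model and the `/tmp/my_model` path are assumptions):

```python
# Sketch only: assumes an installed 'en' model and a writable output location.
from pathlib import Path
import spacy

output_dir = Path('/tmp/my_model')      # hypothetical path
if not output_dir.exists():
    output_dir.mkdir()                  # the directory should exist before saving

nlp = spacy.load('en')
nlp.save_to_directory(output_dir)       # plain strings also work now that ensure_path() is applied

# Reload the saved state later, as the saving-loading docs describe:
nlp2 = spacy.load('en', path=output_dir)
```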
From 2ab394d655677d56dcd508fdd520598c3b25093b Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:45:00 +0200 Subject: [PATCH 005/111] Fix whitespace --- website/docs/usage/saving-loading.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 9fa23aaa7..a3edfce50 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -24,7 +24,7 @@ p | there can be a convenient solution to share them with your team. p - | spaCy comes with a handy CLI command that will create all required files, + | spaCy comes with a handy CLI command that will create all required files, | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. For more info on this, see the From f62b740961dbd7635206c2d93c54d2fe952d5822 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:46:14 +0200 Subject: [PATCH 006/111] Use compat.json_dumps --- spacy/language.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index de97f7a63..9ce90fdef 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2,7 +2,6 @@ from __future__ import absolute_import, unicode_literals from contextlib import contextmanager import shutil -import ujson from .tokenizer import Tokenizer from .vocab import Vocab @@ -15,7 +14,7 @@ from .syntax.nonproj import PseudoProjectivity from .pipeline import DependencyParser, EntityRecognizer from .syntax.arc_eager import ArcEager from .syntax.ner import BiluoPushDown -from .compat import unicode_ +from .compat import json_dumps from .attrs import IS_STOP from . import attrs from . import orth @@ -197,9 +196,7 @@ class Language(object): shutil.rmtree(str(directory)) directory.mkdir() with (directory / 'config.json').open('wb') as file_: - data = ujson.dumps(config, indent=2) - if isinstance(data, unicode_): - data = data.encode('utf8') + data = json_dumps(config) file_.write(data) if not (path / 'vocab').exists(): (path / 'vocab').mkdir() From ddd5194088dbc229de0caf4f2f5128f8c974f5ee Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:52:13 +0200 Subject: [PATCH 007/111] Update Language docs and docstrings --- spacy/language.py | 9 +++++++++ website/docs/api/language.jade | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 9ce90fdef..854b0ebeb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -242,6 +242,15 @@ class Language(object): self.save_to_directory(path) def __init__(self, **overrides): + """ + Create or load the pipeline. + + Arguments: + **overrides: Keyword arguments indicating which defaults to override. + + Returns: + Language: The newly constructed object. + """ if 'data_dir' in overrides and 'path' not in overrides: raise ValueError("The argument 'data_dir' has been renamed to 'path'") path = util.ensure_path(overrides.get('path', True)) diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index e221b9142..d7090c870 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -55,14 +55,14 @@ p Create or load the pipeline. +table(["Name", "Type", "Description"]) +row - +cell #[code **kwrags] + +cell #[code **overrides] +cell - +cell Keyword arguments indicating which defaults to override. 
+footrow +cell return +cell #[code Language] - +cell #[code self] + +cell The newly constructed object. +h(2, "call") Language.__call__ +tag method From 2f84626417b339b42fd1485b53abff8858a7ded3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 18 Apr 2017 13:47:36 +0200 Subject: [PATCH 008/111] Fix train_new_entity_type example --- examples/training/train_new_entity_type.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index cbe2963d3..23cb86596 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -4,7 +4,6 @@ import random from pathlib import Path import spacy -from spacy.pipeline import EntityRecognizer from spacy.gold import GoldParse from spacy.tagger import Tagger @@ -25,10 +24,13 @@ def train_ner(nlp, train_data, output_dir): loss = nlp.entity.update(doc, gold) nlp.end_training() if output_dir: + if not output_dir.exists(): + output_dir.mkdir() nlp.save_to_directory(output_dir) def main(model_name, output_directory=None): + print("Loading initial model", model_name) nlp = spacy.load(model_name) if output_directory is not None: output_directory = Path(output_directory) @@ -52,13 +54,14 @@ def main(model_name, output_directory=None): ) ] nlp.entity.add_label('ANIMAL') - ner = train_ner(nlp, train_data, output_directory) + train_ner(nlp, train_data, output_directory) # Test that the entity is recognized doc = nlp('Do you like horses?') for ent in doc.ents: print(ent.label_, ent.text) if output_directory: + print("Loading from", output_directory) nlp2 = spacy.load('en', path=output_directory) nlp2.entity.add_label('ANIMAL') doc2 = nlp2('Do you like horses?') From 48da244058eac217aed80d59583fb77cc276bd96 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 19 Apr 2017 11:50:33 +0200 Subject: [PATCH 009/111] Use spacy.compat.json_dumps for Python 2/3 compatibility (resolves #991) --- spacy/cli/converters/conllu2json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 3c5ebb0e4..cf473b4a0 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +from ...compat import json_dumps from ... import util @@ -29,7 +30,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): output_filename = input_path.parts[-1].replace(".conllu", ".json") output_file = output_path / output_filename - json.dump(docs, output_file.open('w', encoding='utf-8'), indent=2) + with output_file.open('w', encoding='utf-8') as f: + f.write(json_dumps(docs)) util.print_msg("Created {} documents".format(len(docs)), title="Generated output file {}".format(output_file)) From b763e9b66d1d578b71397c17888f16e9ac485194 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 19 Apr 2017 12:00:12 +0200 Subject: [PATCH 010/111] Add note about variable naming --- CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 327f3d58e..c8bed34ed 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -95,6 +95,8 @@ At the time of writing (v1.7), spaCy's serialization and deserialization functio Although spaCy uses a lot of classes, inheritance is viewed with some suspicion — it's seen as a mechanism of last resort. You should discuss plans to extend the class hierarchy before implementing. 
We have a number of conventions around variable naming that are still being documented, and aren't 100% strict. A general policy is that instances of the class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`, `Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are of other types these names. For instance, don't name a text string `doc` --- you should usually call this `text`. Two general code style preferences further help with naming. First, lean away from introducing temporary variables, as these clutter your namespace. This is one reason why comprehension expressions are often preferred. Second, keep your functions shortish, so that can work in a smaller scope. Of course, this is a question of trade-offs. + ### Cython conventions spaCy's core data structures are implemented as [Cython](http://cython.org/) `cdef` classes. Memory is managed through the `cymem.cymem.Pool` class, which allows you to allocate memory which will be freed when the `Pool` object is garbage collected. This means you usually don't have to worry about freeing memory. You just have to decide which Python object owns the memory, and make it own the `Pool`. When that object goes out of scope, the memory will be freed. You do have to take care that no pointers outlive the object that owns them — but this is generally quite easy. From 275fc9f78a9a1e9069a234fbb48da9bbe5e26532 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 19 Apr 2017 12:09:10 +0200 Subject: [PATCH 011/111] Update CONTRIBUTING.md --- CONTRIBUTING.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c8bed34ed..f1053405e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -87,7 +87,16 @@ Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/). Re ### Python conventions -All Python code must be written in an **intersection of Python 2 and Python 3**. This is easy in Cython, but somewhat ugly in Python. We could use some extra utilities for this. Please pay particular attention to code that serialises json objects. +All Python code must be written in an **intersection of Python 2 and Python 3**. This is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or platform compatibility should only live in [`spacy.compat`](spacy/compat.py). To distinguish them from the builtin functions, replacement functions are suffixed with an underscore, for example `unicode_`. If you need to access the user's version or platform information, for example to show more specific error messages, you can use the `is_config()` helper function. + +```python +from .compat import unicode_, json_dumps, is_config + +compatible_unicode = unicode_('hello world') +compatible_json = json_dumps({'key': 'value'}) +if is_config(windows=True, python2=True): + print("You are using Python 2 on Windows.") +``` Code that interacts with the file-system should accept objects that follow the `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`. If the function is user-facing and takes a path as an argument, it should check whether the path is provided as a string. Strings should be converted to `pathlib.Path` objects. At the time of writing (v1.7), spaCy's serialization and deserialization functio Although spaCy uses a lot of classes, inheritance is viewed with some suspicion — it's seen as a mechanism of last resort.
You should discuss plans to extend the class hierarchy before implementing. -We have a number of conventions around variable naming that are still being documented, and aren't 100% strict. A general policy is that instances of the class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`, `Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are of other types these names. For instance, don't name a text string `doc` --- you should usually call this `text`. Two general code style preferences further help with naming. First, lean away from introducing temporary variables, as these clutter your namespace. This is one reason why comprehension expressions are often preferred. Second, keep your functions shortish, so that can work in a smaller scope. Of course, this is a question of trade-offs. +We have a number of conventions around variable naming that are still being documented, and aren't 100% strict. A general policy is that instances of the class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`, `Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are of other types these names. For instance, don't name a text string `doc` — you should usually call this `text`. Two general code style preferences further help with naming. First, lean away from introducing temporary variables, as these clutter your namespace. This is one reason why comprehension expressions are often preferred. Second, keep your functions shortish, so that can work in a smaller scope. Of course, this is a question of trade-offs. ### Cython conventions @@ -128,7 +137,7 @@ cdef int c_total(const int* int_array, int length) nogil: return total ``` -If this is confusing, consider that the compiler couldn't deal with `for item in int_array:` — there's no length attached to a raw pointer, so how could we figure out where to stop? The length is provided in the slice notation as a solution to this. Note that we don't have to declare the type of `item` in the code above -- the compiler can easily infer it. This gives us tidy code that looks quite like Python, but is exactly as fast as C — because we've made sure the compilation to C is trivial. +If this is confusing, consider that the compiler couldn't deal with `for item in int_array:` — there's no length attached to a raw pointer, so how could we figure out where to stop? The length is provided in the slice notation as a solution to this. Note that we don't have to declare the type of `item` in the code above — the compiler can easily infer it. This gives us tidy code that looks quite like Python, but is exactly as fast as C — because we've made sure the compilation to C is trivial. Your functions cannot be declared `nogil` if they need to create Python objects or call Python functions. This is perfectly okay — you shouldn't torture your code just to get `nogil` functions. However, if your function isn't `nogil`, you should compile your module with `cython -a --cplus my_module.pyx` and open the resulting `my_module.html` file in a browser. This will let you see how Cython is compiling your code. Calls into the Python run-time will be in bright yellow. This lets you easily see whether Cython is able to correctly type your code, or whether there are unexpected problems. 
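A short sketch of the naming conventions described in the CONTRIBUTING.md patches above (illustrative only, not part of the patch series; assumes an installed `en` model):

```python
# Conventions from CONTRIBUTING.md: Language -> nlp, Doc -> doc, Token -> token,
# and a plain string is called `text`, never `doc`.
import spacy

nlp = spacy.load('en')
text = u'This is a sentence.'
doc = nlp(text)
token_texts = [token.text for token in doc]   # comprehension avoids temporary variables
```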
From 2bd89e7ade7bfb01a82380f65caa30d1d9fc9fb2 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 19 Apr 2017 19:28:00 +0200 Subject: [PATCH 012/111] Tidy up Hebrew tests and test for punctuation (see #995) --- spacy/tests/he/test_tokenizer.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py index a6c65805a..c2504a0e7 100644 --- a/spacy/tests/he/test_tokenizer.py +++ b/spacy/tests/he/test_tokenizer.py @@ -3,15 +3,21 @@ from __future__ import unicode_literals import pytest -ABBREVIATION_TESTS = [ - ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית']) -] -TESTCASES = ABBREVIATION_TESTS - - -@pytest.mark.parametrize('text,expected_tokens', TESTCASES) -def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens): +@pytest.mark.parametrize('text,expected_tokens', + [('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])]) +def test_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens): tokens = he_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] - assert expected_tokens == token_list \ No newline at end of file + assert expected_tokens == token_list + + +@pytest.mark.parametrize('text,expected_tokens', [ + pytest.mark.xfail(('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.'])), + ('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']), + ('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']), + ('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']), + ('עקבת אחריו בכל רחבי המדינה...', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '...'])]) +def test_tokenizer_handles_punct(he_tokenizer, text, expected_tokens): + tokens = he_tokenizer(text) + assert expected_tokens == [token.text for token in tokens] From bc9557b21ffde564e39a98a0819fd6a63f47e11a Mon Sep 17 00:00:00 2001 From: Leif Uwe Vogelsang Date: Thu, 23 Mar 2017 11:10:22 +0100 Subject: [PATCH 013/111] Norwegian language basics --- spacy/nb/__init__.py | 26 +++++ spacy/nb/language_data.py | 28 +++++ spacy/nb/morph_rules.py | 67 ++++++++++++ spacy/nb/stop_words.py | 40 +++++++ spacy/nb/tokenizer_exceptions.py | 175 +++++++++++++++++++++++++++++++ 5 files changed, 336 insertions(+) create mode 100644 spacy/nb/__init__.py create mode 100644 spacy/nb/language_data.py create mode 100644 spacy/nb/morph_rules.py create mode 100644 spacy/nb/stop_words.py create mode 100644 spacy/nb/tokenizer_exceptions.py diff --git a/spacy/nb/__init__.py b/spacy/nb/__init__.py new file mode 100644 index 000000000..6c1aab979 --- /dev/null +++ b/spacy/nb/__init__.py @@ -0,0 +1,26 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language +from ..attrs import LANG + + +# Import language-specific data +from .language_data import * + + +# create Language subclass +class NorwegianBokmal(Language): + lang = 'nb' # ISO code + + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'nb' + + + # override defaults + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + #tag_map = TAG_MAP + stop_words = STOP_WORDS \ No newline at end of file diff --git a/spacy/nb/language_data.py b/spacy/nb/language_data.py new file mode 100644 index 000000000..9383f3a62 --- /dev/null +++ b/spacy/nb/language_data.py @@ -0,0 +1,28 @@ 
+# encoding: utf8 +from __future__ import unicode_literals + + +# import base language data +from .. import language_data as base + + +# import util functions +from ..language_data import update_exc, strings_to_exc, expand_exc + +# import language-specific data from files +#from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY +from .morph_rules import MORPH_RULES + +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) +#TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) + +# customize tokenizer exceptions +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) +update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’")) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) + +# export +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"] \ No newline at end of file diff --git a/spacy/nb/morph_rules.py b/spacy/nb/morph_rules.py new file mode 100644 index 000000000..1f9f1b84e --- /dev/null +++ b/spacy/nb/morph_rules.py @@ -0,0 +1,67 @@ +# encoding: utf8 +# norwegian bokmål +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + +# Used the table of pronouns at https://no.wiktionary.org/wiki/Tillegg:Pronomen_i_norsk + +MORPH_RULES = { + "PRP": { + "jeg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Nom"}, + "meg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Case": "Acc"}, + "du": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Case": "Nom"}, + "deg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Case": "Acc"}, + "han": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Nom"}, + "ham": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, + "han": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Masc", "Case": "Acc"}, + "hun": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Nom"}, + "henne": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Fem", "Case": "Acc"}, + "den": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "det": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Gender": "Neut"}, + "seg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Reflex": "Yes"}, + "vi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Nom"}, + "oss": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Case": "Acc"}, + "dere": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Case": "Nom"}, + "de": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Nom"}, + "dem": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Case": "Acc"}, + "seg": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Plur", "Reflex": "Yes"}, + + "min": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Masc"}, + "mi": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender": "Fem"}, + "mitt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": 
"Sing", "Poss": "Yes", "Gender": "Neu"}, + "mine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, + "din": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Masc"}, + "di": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Fem"}, + "ditt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Sing", "Poss": "Yes", "Gender": "Neu"}, + "dine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Two", "Number": "Plur", "Poss": "Yes"}, + "hans": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Masc"}, + "hennes": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Fem"}, + "dens": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Neu"}, + "dets": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "Three", "Number": "Sing", "Poss": "Yes", "Gender": "Neu"}, + "vår": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, + "vårt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes"}, + "våre": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Gender":"Neu"}, + "deres": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Gender":"Neu", "Reflex":"Yes"}, + "sin": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender":"Masc", "Reflex":"Yes"}, + "si": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender":"Fem", "Reflex":"Yes"}, + "sitt": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Sing", "Poss": "Yes", "Gender":"Neu", "Reflex":"Yes"}, + "sine": {LEMMA: PRON_LEMMA, "PronType": "Prs", "Person": "One", "Number": "Plur", "Poss": "Yes", "Reflex":"Yes"}, + }, + + "VBZ": { + "er": {LEMMA: "be", "VerbForm": "Fin", "Person": "One", "Tense": "Pres", "Mood": "Ind"}, + "er": {LEMMA: "be", "VerbForm": "Fin", "Person": "Two", "Tense": "Pres", "Mood": "Ind"}, + "er": {LEMMA: "be", "VerbForm": "Fin", "Person": "Three", "Tense": "Pres", "Mood": "Ind"}, + }, + + "VBP": { + "er": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Pres", "Mood": "Ind"} + }, + + "VBD": { + "var": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, + "vært": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"} + } +} \ No newline at end of file diff --git a/spacy/nb/stop_words.py b/spacy/nb/stop_words.py new file mode 100644 index 000000000..62d1a6028 --- /dev/null +++ b/spacy/nb/stop_words.py @@ -0,0 +1,40 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +STOP_WORDS = set("""alle at av + +bare begge ble blei bli blir blitt både båe + +da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då + +eg ein eit eitt eller elles en enn er et ett etter + +for fordi fra før + +ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor + +i ikke ikkje ikkje ingen ingi inkje inn inni + +ja jeg + +kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor + +man mange me med medan meg meget mellom men mi min mine mitt mot mykje + +ned no noe noen noka noko nokon nokor nokre nå når + +og også om opp oss over + +på + +samme seg 
selv si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn + +til + +um upp ut uten + +var vart varte ved vere verte vi vil ville vore vors vort vår være være vært + +å +""".split()) \ No newline at end of file diff --git a/spacy/nb/tokenizer_exceptions.py b/spacy/nb/tokenizer_exceptions.py new file mode 100644 index 000000000..44fc76532 --- /dev/null +++ b/spacy/nb/tokenizer_exceptions.py @@ -0,0 +1,175 @@ +# encoding: utf8 +# Norwegian bokmaål +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + + +TOKENIZER_EXCEPTIONS = { + "jan.": [ + {ORTH: "jan.", LEMMA: "januar"} + ], + + "feb.": [ + {ORTH: "feb.", LEMMA: "februar"} + ], + + "jul.": [ + {ORTH: "jul.", LEMMA: "juli"} + ] +} + + +ORTH_ONLY = ["adm.dir.", + "a.m.", + "Aq.", + "b.c.", + "bl.a.", + "bla.", + "bm.", + "bto.", + "ca.", + "cand.mag.", + "c.c.", + "co.", + "d.d.", + "dept.", + "d.m.", + "dr.philos.", + "dvs.", + "d.y.", + "E. coli", + "eg.", + "ekskl.", + "e.Kr.", + "el.", + "e.l.", + "et.", + "etg.", + "ev.", + "evt.", + "f.", + "f.eks.", + "fhv.", + "fk.", + "f.Kr.", + "f.o.m.", + "foreg.", + "fork.", + "fv.", + "fvt.", + "g.", + "gt.", + "gl.", + "gno.", + "gnr.", + "grl.", + "hhv.", + "hoh.", + "hr.", + "h.r.adv.", + "ifb.", + "ifm.", + "iht.", + "inkl.", + "istf.", + "jf.", + "jr.", + "jun.", + "kfr.", + "kgl.res.", + "kl.", + "komm.", + "kst.", + "lø.", + "ma.", + "mag.art.", + "m.a.o.", + "md.", + "mfl.", + "mill.", + "min.", + "m.m.", + "mnd.", + "moh.", + "Mr.", + "muh.", + "mv.", + "mva.", + "ndf.", + "no.", + "nov.", + "nr.", + "nto.", + "nyno.", + "n.å.", + "o.a.", + "off.", + "ofl.", + "okt.", + "o.l.", + "on.", + "op.", + "osv.", + "ovf.", + "p.", + "p.a.", + "Pb.", + "pga.", + "ph.d.", + "pkt.", + "p.m.", + "pr.", + "pst.", + "p.t.", + "red.anm.", + "ref.", + "res.", + "res.kap.", + "resp.", + "rv.", + "s.", + "s.d.", + "sen.", + "sep.", + "siviling.", + "sms.", + "spm.", + "sr.", + "sst.", + "st.", + "stip.", + "stk.", + "st.meld.", + "st.prp.", + "stud.", + "s.u.", + "sv.", + "sø.", + "s.å.", + "såk.", + "temp.", + "ti.", + "tils.", + "tilsv.", + "tl;dr", + "tlf.", + "to.", + "t.o.m.", + "ult.", + "utg.", + "v.", + "vedk.", + "vedr.", + "vg.", + "vgs.", + "vha.", + "vit.ass.", + "vn.", + "vol.", + "vs.", + "vsa.", + "årg.", + "årh." +] \ No newline at end of file From 3796c668d9c0d33c472262e18669d680c4b2409f Mon Sep 17 00:00:00 2001 From: Leif Uwe Vogelsang Date: Tue, 28 Mar 2017 14:10:20 +0200 Subject: [PATCH 014/111] more norwegian --- spacy/__init__.py | 43 +++++++++++++++++++++-- spacy/nb/__init__.py | 7 ++-- spacy/nb/stop_words.py | 79 +++++++++++++++++++++++------------------- 3 files changed, 87 insertions(+), 42 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index efd6c00c0..7b2769ecd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -3,19 +3,52 @@ from __future__ import unicode_literals from . import util from .deprecated import resolve_model_name -from .cli import info +from .cli.info import info -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he + +from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, - fi.Finnish, bn.Bengali, he.Hebrew) + fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian) for _lang in _languages: util.set_lang_class(_lang.lang, _lang) +from . import en +from . import de +from . import zh +from . 
import es +from . import it +from . import hu +from . import fr +from . import pt +from . import nl +from . import sv +from . import fi +from . import bn +from . import nb + +from .about import * + + +set_lang_class(en.English.lang, en.English) +set_lang_class(de.German.lang, de.German) +set_lang_class(es.Spanish.lang, es.Spanish) +set_lang_class(pt.Portuguese.lang, pt.Portuguese) +set_lang_class(fr.French.lang, fr.French) +set_lang_class(it.Italian.lang, it.Italian) +set_lang_class(hu.Hungarian.lang, hu.Hungarian) +set_lang_class(zh.Chinese.lang, zh.Chinese) +set_lang_class(nl.Dutch.lang, nl.Dutch) +set_lang_class(sv.Swedish.lang, sv.Swedish) +set_lang_class(fi.Finnish.lang, fi.Finnish) +set_lang_class(bn.Bengali.lang, bn.Bengali) +set_lang_class(nb.Norwegian.lang, nb.Norwegian) +>>>>>>> more norwegian + def load(name, **overrides): if overrides.get('path') in (None, False, True): @@ -38,3 +71,7 @@ def load(name, **overrides): overrides['meta'] = meta overrides['path'] = model_path return cls(**overrides) + + +def info(name, markdown): + info(name, markdown) diff --git a/spacy/nb/__init__.py b/spacy/nb/__init__.py index 6c1aab979..6389356a7 100644 --- a/spacy/nb/__init__.py +++ b/spacy/nb/__init__.py @@ -15,12 +15,11 @@ from .language_data import * class NorwegianBokmal(Language): lang = 'nb' # ISO code - class Defaults(Language.Defaults): + class Defaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) lex_attr_getters[LANG] = lambda text: 'nb' - - + # override defaults tokenizer_exceptions = TOKENIZER_EXCEPTIONS #tag_map = TAG_MAP - stop_words = STOP_WORDS \ No newline at end of file + stop_words = STOP_WORDS \ No newline at end of file diff --git a/spacy/nb/stop_words.py b/spacy/nb/stop_words.py index 62d1a6028..a4d10ad76 100644 --- a/spacy/nb/stop_words.py +++ b/spacy/nb/stop_words.py @@ -1,40 +1,49 @@ # encoding: utf8 from __future__ import unicode_literals - -STOP_WORDS = set("""alle at av - -bare begge ble blei bli blir blitt både båe - -da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då - -eg ein eit eitt eller elles en enn er et ett etter - -for fordi fra før - -ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor - -i ikke ikkje ikkje ingen ingi inkje inn inni - + +STOP_WORDS = set(""" +alle allerede alt and andre annen annet at av + +bak bare bedre beste blant ble bli blir blitt bris by både + +da dag de del dem den denne der dermed det dette disse drept du + +eller en enn er et ett etter + +fem fikk fire fjor flere folk for fortsatt fotball fra fram frankrike fredag funnet få får fått før først første + +gang gi gikk gjennom gjorde gjort gjør gjøre god godt grunn gå går + +ha hadde ham han hans har hele helt henne hennes her hun hva hvor hvordan hvorfor + +i ifølge igjen ikke ingen inn + ja jeg - -kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor - -man mange me med medan meg meget mellom men mi min mine mitt mot mykje - -ned no noe noen noka noko nokon nokor nokre nå når - -og også om opp oss over - -på - -samme seg selv si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn - -til - -um upp ut uten - -var vart varte ved vere verte vi vil ville vore vors vort vår være være vært - -å + +kamp kampen kan kl klart kom komme kommer kontakt kort kroner kunne kveld kvinner + +la laget land landet langt leder ligger like litt løpet lørdag + +man mandag mange mannen mars 
med meg mellom men mener menn mennesker mens mer millioner minutter mot msci mye må mål måtte + +ned neste noe noen nok norge norsk norske ntb ny nye nå når + +og også om onsdag opp opplyser oslo oss over + +personer plass poeng politidistrikt politiet president prosent på + +regjeringen runde rundt russland + +sa saken samme sammen samtidig satt se seg seks selv senere september ser sett siden sier sin sine siste sitt skal skriver skulle slik som sted stedet stor store står sverige svært så søndag + +ta tatt tid tidligere til tilbake tillegg tirsdag to tok torsdag tre tror tyskland + +under usa ut uten utenfor + +vant var ved veldig vi videre viktig vil ville viser vår være vært + +å år + +ønsker """.split()) \ No newline at end of file From ff900ffd7cf4b038b7091960dccf23a8bb905c93 Mon Sep 17 00:00:00 2001 From: luvogels Date: Sat, 8 Apr 2017 17:06:12 +0200 Subject: [PATCH 015/111] Update setup.py added nb --- setup.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 setup.py diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index f7cd0ddcb..d8fbe9bb4 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ PACKAGES = [ 'spacy.fi', 'spacy.bn', 'spacy.he', + 'spacy.nb', 'spacy.en.lemmatizer', 'spacy.cli.converters', 'spacy.language_data', From e821c484895d42be9a90743ffd8d31a0469f841b Mon Sep 17 00:00:00 2001 From: Leif Uwe Vogelsang Date: Thu, 23 Mar 2017 11:10:22 +0100 Subject: [PATCH 016/111] Norwegian language basics --- spacy/nb/stop_words.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/spacy/nb/stop_words.py b/spacy/nb/stop_words.py index a4d10ad76..ef6f13717 100644 --- a/spacy/nb/stop_words.py +++ b/spacy/nb/stop_words.py @@ -1,6 +1,7 @@ # encoding: utf8 from __future__ import unicode_literals +<<<<<<< HEAD STOP_WORDS = set(""" alle allerede alt and andre annen annet at av @@ -46,4 +47,42 @@ vant var ved veldig vi videre viktig vil ville viser vår være vært å år ønsker +======= + +STOP_WORDS = set("""alle at av + +bare begge ble blei bli blir blitt både båe + +da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då + +eg ein eit eitt eller elles en enn er et ett etter + +for fordi fra før + +ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor + +i ikke ikkje ikkje ingen ingi inkje inn inni + +ja jeg + +kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor + +man mange me med medan meg meget mellom men mi min mine mitt mot mykje + +ned no noe noen noka noko nokon nokor nokre nå når + +og også om opp oss over + +på + +samme seg selv si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn + +til + +um upp ut uten + +var vart varte ved vere verte vi vil ville vore vors vort vår være være vært + +å +>>>>>>> Norwegian language basics """.split()) \ No newline at end of file From 538a8d6b129a86cf35c84bbce5137df7fca41f02 Mon Sep 17 00:00:00 2001 From: Leif Uwe Vogelsang Date: Mon, 10 Apr 2017 17:45:25 +0200 Subject: [PATCH 017/111] Resolved merge conflict by incorporating both suggestions. --- spacy/__init__.py | 5 ++++- spacy/nb/stop_words.py | 41 +---------------------------------------- 2 files changed, 5 insertions(+), 41 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 7b2769ecd..0e7e8603f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from . 
import util from .deprecated import resolve_model_name from .cli.info import info +<<<<<<< HEAD from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb @@ -29,6 +30,7 @@ from . import nl from . import sv from . import fi from . import bn +from . import he from . import nb from .about import * @@ -47,7 +49,8 @@ set_lang_class(sv.Swedish.lang, sv.Swedish) set_lang_class(fi.Finnish.lang, fi.Finnish) set_lang_class(bn.Bengali.lang, bn.Bengali) set_lang_class(nb.Norwegian.lang, nb.Norwegian) ->>>>>>> more norwegian +set_lang_class(he.Hebrew.lang, he.Hebrew) +set_lang_class(nb.Norwegian.lang, nb.Norwegian) def load(name, **overrides): diff --git a/spacy/nb/stop_words.py b/spacy/nb/stop_words.py index ef6f13717..56e0ef3bc 100644 --- a/spacy/nb/stop_words.py +++ b/spacy/nb/stop_words.py @@ -1,7 +1,5 @@ # encoding: utf8 from __future__ import unicode_literals - -<<<<<<< HEAD STOP_WORDS = set(""" alle allerede alt and andre annen annet at av @@ -47,42 +45,5 @@ vant var ved veldig vi videre viktig vil ville viser vår være vært å år ønsker -======= - -STOP_WORDS = set("""alle at av - -bare begge ble blei bli blir blitt både båe - -da de deg dei deim deira deires dem den denne der dere deres det dette di din disse ditt du dykk dykkar då - -eg ein eit eitt eller elles en enn er et ett etter - -for fordi fra før - -ha hadde han hans har hennar henne hennes her hjå ho hoe honom hoss hossen hun hva hvem hver hvilke hvilken hvis hvor hvordan hvorfor - -i ikke ikkje ikkje ingen ingi inkje inn inni - -ja jeg - -kan kom korleis korso kun kunne kva kvar kvarhelst kven kvi kvifor - -man mange me med medan meg meget mellom men mi min mine mitt mot mykje - -ned no noe noen noka noko nokon nokor nokre nå når - -og også om opp oss over - -på - -samme seg selv si sia sidan siden sin sine sitt sjøl skal skulle slik so som som somme somt så sånn - -til - -um upp ut uten - -var vart varte ved vere verte vi vil ville vore vors vort vår være være vært - -å ->>>>>>> Norwegian language basics + """.split()) \ No newline at end of file From 03abd0c8e698742f93909b760c57adddb040cfbd Mon Sep 17 00:00:00 2001 From: luvogels Date: Tue, 11 Apr 2017 18:29:09 +0200 Subject: [PATCH 018/111] Update __init__.py --- spacy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 0e7e8603f..19cc61c06 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -48,7 +48,6 @@ set_lang_class(nl.Dutch.lang, nl.Dutch) set_lang_class(sv.Swedish.lang, sv.Swedish) set_lang_class(fi.Finnish.lang, fi.Finnish) set_lang_class(bn.Bengali.lang, bn.Bengali) -set_lang_class(nb.Norwegian.lang, nb.Norwegian) set_lang_class(he.Hebrew.lang, he.Hebrew) set_lang_class(nb.Norwegian.lang, nb.Norwegian) From 55e8cade363c128d082507a4e39b06bdc4ca5cf4 Mon Sep 17 00:00:00 2001 From: luvogels Date: Tue, 11 Apr 2017 18:30:15 +0200 Subject: [PATCH 019/111] Update __init__.py --- spacy/nb/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/nb/__init__.py b/spacy/nb/__init__.py index 6389356a7..0392f6520 100644 --- a/spacy/nb/__init__.py +++ b/spacy/nb/__init__.py @@ -12,14 +12,14 @@ from .language_data import * # create Language subclass -class NorwegianBokmal(Language): +class Norwegian(Language): lang = 'nb' # ISO code class Defaults(Language.Defaults): - lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: 'nb' + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'nb' - 
# override defaults - tokenizer_exceptions = TOKENIZER_EXCEPTIONS - #tag_map = TAG_MAP - stop_words = STOP_WORDS \ No newline at end of file + # override defaults + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + #tag_map = TAG_MAP + stop_words = STOP_WORDS From c7cec7e5e23f50ab4044364fc525ff48ab52da84 Mon Sep 17 00:00:00 2001 From: luvogels Date: Wed, 12 Apr 2017 17:59:55 +0200 Subject: [PATCH 020/111] Update __init__.py --- spacy/nb/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/nb/__init__.py b/spacy/nb/__init__.py index 0392f6520..7d4b471a2 100644 --- a/spacy/nb/__init__.py +++ b/spacy/nb/__init__.py @@ -15,11 +15,11 @@ from .language_data import * class Norwegian(Language): lang = 'nb' # ISO code - class Defaults(Language.Defaults): - lex_attr_getters = dict(Language.Defaults.lex_attr_getters) - lex_attr_getters[LANG] = lambda text: 'nb' + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda text: 'nb' - # override defaults - tokenizer_exceptions = TOKENIZER_EXCEPTIONS - #tag_map = TAG_MAP - stop_words = STOP_WORDS + # override defaults + tokenizer_exceptions = TOKENIZER_EXCEPTIONS + #tag_map = TAG_MAP + stop_words = STOP_WORDS From 936a29724160f403273ff3e3a3459860d7089b38 Mon Sep 17 00:00:00 2001 From: oeg Date: Wed, 19 Apr 2017 23:30:21 +0200 Subject: [PATCH 021/111] fix(model): Fix tag map for fixing issues with tag SPACE --- spacy/es/tag_map.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/es/tag_map.py b/spacy/es/tag_map.py index bdeb7250f..f0f1cd443 100644 --- a/spacy/es/tag_map.py +++ b/spacy/es/tag_map.py @@ -304,4 +304,5 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "X___": {"morph": "_", "pos": "X"}, + "SP": {"morph": "_", "pos": "SPACE"}, } From d8098a8be2d66c71370fca0681a53593ce95682b Mon Sep 17 00:00:00 2001 From: Ben Eyal Date: Thu, 20 Apr 2017 02:22:52 +0300 Subject: [PATCH 022/111] Use `regex` instead of `re` --- spacy/fr/tokenizer_exceptions.py | 2 +- spacy/hu/tokenizer_exceptions.py | 2 +- spacy/language_data/punctuation.py | 2 ++ spacy/util.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/fr/tokenizer_exceptions.py b/spacy/fr/tokenizer_exceptions.py index 43806e270..eef7d789d 100644 --- a/spacy/fr/tokenizer_exceptions.py +++ b/spacy/fr/tokenizer_exceptions.py @@ -13,7 +13,7 @@ from ..symbols import * import os import io -import re +import regex as re def get_exceptions(): diff --git a/spacy/hu/tokenizer_exceptions.py b/spacy/hu/tokenizer_exceptions.py index 85cf72ec9..a6dc47511 100644 --- a/spacy/hu/tokenizer_exceptions.py +++ b/spacy/hu/tokenizer_exceptions.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals -import re +import regex as re from spacy.language_data.punctuation import ALPHA_LOWER, CURRENCY from ..language_data.tokenizer_exceptions import _URL_PATTERN diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index f94d91e80..4bb31c340 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -16,6 +16,8 @@ A Ä À Á  Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ """ +import regex as re +re.DEFAULT_VERSION = re.VERSION1 _UNITS = """ diff --git a/spacy/util.py 
b/spacy/util.py index 0ccdfbd72..0c7136522 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals, print_function import ujson -import re +import regex as re from pathlib import Path import sys import textwrap From 33af52599e3aac9834ec2f104556e6ef67bce968 Mon Sep 17 00:00:00 2001 From: Ben Eyal Date: Thu, 20 Apr 2017 02:25:02 +0300 Subject: [PATCH 023/111] Redefine alphabetic characters For caseless languages (Hebrew, Bengali) all characters are both lowercase and uppercase. --- spacy/language_data/punctuation.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/spacy/language_data/punctuation.py b/spacy/language_data/punctuation.py index 4bb31c340..f23b15bbc 100644 --- a/spacy/language_data/punctuation.py +++ b/spacy/language_data/punctuation.py @@ -1,21 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import re - - -_ALPHA_LOWER = """ -a ä à á â ǎ æ ã å ā ă ą b c ç ć č ĉ ċ c̄ d ð ď e é è ê ë ė ȅ ȩ ẽ ę f g ĝ ğ h i ı -î ï í ī ì ȉ ǐ į ĩ j k ķ l ł ļ m n ñ ń ň ņ o ö ó ò ő ô õ œ ø ō ő ǒ ơ p q r ř ŗ s -ß ś š ş ŝ t ť u ú û ù ú ū ű ǔ ů ų ư v w ŵ x y ÿ ý ỳ ŷ ỹ z ź ž ż þ -""" - - -_ALPHA_UPPER = """ -A Ä À Á  Ǎ Æ Ã Å Ā Ă Ą B C Ç Ć Č Ĉ Ċ C̄ D Ð Ď E É È Ê Ë Ė Ȅ Ȩ Ẽ Ę F G Ĝ Ğ H I İ -Î Ï Í Ī Ì Ȉ Ǐ Į Ĩ J K Ķ L Ł Ļ M N Ñ Ń Ň Ņ O Ö Ó Ò Ő Ô Õ Œ Ø Ō Ő Ǒ Ơ P Q R Ř Ŗ S -Ś Š Ş Ŝ T Ť U Ú Û Ù Ú Ū Ű Ǔ Ů Ų Ư V W Ŵ X Y Ÿ Ý Ỳ Ŷ Ỹ Z Ź Ž Ż Þ -""" import regex as re re.DEFAULT_VERSION = re.VERSION1 @@ -59,9 +44,16 @@ LIST_PUNCT = list(_PUNCT.strip().split()) LIST_HYPHENS = list(_HYPHENS.strip().split()) -ALPHA_LOWER = _ALPHA_LOWER.strip().replace(' ', '').replace('\n', '') -ALPHA_UPPER = _ALPHA_UPPER.strip().replace(' ', '').replace('\n', '') -ALPHA = ALPHA_LOWER + ALPHA_UPPER +BENGALI = r'[\p{L}&&\p{Bengali}]' +HEBREW = r'[\p{L}&&\p{Hebrew}]' +LATIN_LOWER = r'[\p{Ll}&&\p{Latin}]' +LATIN_UPPER = r'[\p{Lu}&&\p{Latin}]' +LATIN = r'[[\p{Ll}||\p{Lu}]&&\p{Latin}]' + + +ALPHA_LOWER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_LOWER])) +ALPHA_UPPER = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN_UPPER])) +ALPHA = '[{}]'.format('||'.join([BENGALI, HEBREW, LATIN])) QUOTES = _QUOTES.strip().replace(' ', '|') From e90e8a3f101552668e75e3e623af2e22865743e1 Mon Sep 17 00:00:00 2001 From: Ben Eyal Date: Thu, 20 Apr 2017 02:25:24 +0300 Subject: [PATCH 024/111] Enable test --- spacy/tests/he/test_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py index c2504a0e7..62ae84223 100644 --- a/spacy/tests/he/test_tokenizer.py +++ b/spacy/tests/he/test_tokenizer.py @@ -13,7 +13,7 @@ def test_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens): @pytest.mark.parametrize('text,expected_tokens', [ - pytest.mark.xfail(('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.'])), + ('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.']), ('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']), ('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']), ('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']), From f0bcd0babb1493c566c32942043ab78cebd8eb7f Mon Sep 17 00:00:00 2001 From: oeg Date: Thu, 20 Apr 2017 11:36:24 +0200 Subject: [PATCH 025/111] fix(model): Add SPACE to es tag_map. 
Fixing error in morphology.pyx when SP tag is missing --- spacy/es/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/es/tag_map.py b/spacy/es/tag_map.py index f0f1cd443..dce29c921 100644 --- a/spacy/es/tag_map.py +++ b/spacy/es/tag_map.py @@ -304,5 +304,5 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "X___": {"morph": "_", "pos": "X"}, - "SP": {"morph": "_", "pos": "SPACE"}, + "SP": {"morph": "_", "pos": "SPACE"} } From 1f785d25c6cbc1bec58a8a94cbc6c6fe2a9a9db6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Apr 2017 12:28:05 +0200 Subject: [PATCH 026/111] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index d40dedf55..ce4ca62db 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -6,6 +6,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Andreas Grivas, [@andreasgrv](https://github.com/andreasgrv) * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th) * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary) +* Ben Eyal, [@beneyal](https://github.com/beneyal) * Bhargav Srinivasa, [@bhargavvader](https://github.com/bhargavvader) * Bruno P. Kinoshita, [@kinow](https://github.com/kinow) * Chris DuBois, [@chrisdubois](https://github.com/chrisdubois) From 4a06a2572c89d6a116bb23b7d16236b37b4061a0 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Thu, 20 Apr 2017 13:34:51 +0200 Subject: [PATCH 027/111] Using ftfy for handling broken encoded strings. --- requirements.txt | 1 + setup.py | 3 ++- spacy/cli/model.py | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6212ab3cd..0108c5621 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 regex==2017.4.5 +ftfy==4.4.2 pytest>=3.0.6,<4.0.0 diff --git a/setup.py b/setup.py index f7cd0ddcb..e343c3208 100644 --- a/setup.py +++ b/setup.py @@ -248,7 +248,8 @@ def setup_package(): 'ujson>=1.35', 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', - 'regex==2017.4.5'], + 'regex==2017.4.5', + 'ftfy == 4.4.2'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', diff --git a/spacy/cli/model.py b/spacy/cli/model.py index d697df05b..486bbea2f 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -6,6 +6,7 @@ import math from ast import literal_eval from pathlib import Path from preshed.counter import PreshCounter +import ftfy from ..vocab import write_binary_vectors from .. 
import util @@ -41,7 +42,7 @@ def create_model(model_path, vectors_path, vocab, oov_prob): with oov_path.open('w') as f: f.write('%f' % oov_prob) if vectors_path: - vectors_dest = model_path / 'vec.bin' + vectors_dest = vocab_path / 'vec.bin' write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix()) @@ -76,6 +77,7 @@ def read_clusters(clusters_path): for line in f: try: cluster, word, freq = line.split() + word = ftfy.fix_text(word) except ValueError: continue # If the clusterer has only seen the word a few times, its From 40a8f22ca72f609af7eb74764d1036ffa1834d5e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Apr 2017 15:38:52 +0200 Subject: [PATCH 028/111] Relax version contraint --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0108c5621..42910d1be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,5 +11,5 @@ ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 regex==2017.4.5 -ftfy==4.4.2 +ftfy>=4.4.2,<5.0.0 pytest>=3.0.6,<4.0.0 From 417f430d23cf2c1ffcc16a886694d27b62c0e04e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Apr 2017 15:39:24 +0200 Subject: [PATCH 029/111] Relax version contstraint --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e343c3208..69ba880eb 100644 --- a/setup.py +++ b/setup.py @@ -249,7 +249,7 @@ def setup_package(): 'dill>=0.2,<0.3', 'requests>=2.13.0,<3.0.0', 'regex==2017.4.5', - 'ftfy == 4.4.2'], + 'ftfy>=4.4.2,<5.0.0'], classifiers=[ 'Development Status :: 5 - Production/Stable', 'Environment :: Console', From 25c70b4cc5364365fd2924f3f7049f15c547114d Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 20 Apr 2017 15:47:17 +0200 Subject: [PATCH 030/111] Move fix_text to spacy.compat (see #1002) --- spacy/cli/model.py | 4 ++-- spacy/compat.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 486bbea2f..3b9a77b93 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -6,9 +6,9 @@ import math from ast import literal_eval from pathlib import Path from preshed.counter import PreshCounter -import ftfy from ..vocab import write_binary_vectors +from ..compat import fix_text from .. 
import util @@ -77,7 +77,7 @@ def read_clusters(clusters_path): for line in f: try: cluster, word, freq = line.split() - word = ftfy.fix_text(word) + word = fix_text(word) except ValueError: continue # If the clusterer has only seen the word a few times, its diff --git a/spacy/compat.py b/spacy/compat.py index d216994cc..8458df7b0 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import six +import ftfy import sys import ujson @@ -38,6 +39,9 @@ elif is_python3: json_dumps = lambda data: ujson.dumps(data, indent=2) +fix_text = lambda text: ftfy.fix_text(text) + + def symlink_to(orig, dest): if is_python2 and is_windows: import subprocess From 4eef200bab12ac76e7e1e5ab47bdf0edae3f234e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 20 Apr 2017 17:02:44 +0200 Subject: [PATCH 031/111] Persist the actions within spacy.parser.cfg --- spacy/syntax/parser.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 0bc9cb4ef..4f240d75f 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -174,13 +174,14 @@ cdef class Parser: if TransitionSystem is None: TransitionSystem = self.TransitionSystem self.vocab = vocab - actions = TransitionSystem.get_actions(**cfg) - self.moves = TransitionSystem(vocab.strings, actions) + cfg['actions'] = TransitionSystem.get_actions(**cfg) + self.moves = TransitionSystem(vocab.strings, cfg['actions']) # TODO: Remove this when we no longer need to support old-style models if isinstance(cfg.get('features'), basestring): cfg['features'] = get_templates(cfg['features']) elif 'features' not in cfg: cfg['features'] = self.feature_templates + self.model = ParserModel(cfg['features']) self.model.l1_penalty = cfg.get('L1', 0.0) self.model.learn_rate = cfg.get('learn_rate', 0.001) From ade920c30f1b0110daa222f6f2d248374f693fd1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 23 Apr 2017 12:09:35 +0200 Subject: [PATCH 032/111] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f1053405e..837b0a469 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -40,7 +40,7 @@ To distinguish issues that are opened by us, the maintainers, we usually add a | [`performance`](https://github.com/explosion/spaCy/labels/performance) | Accuracy, speed and memory use problems | | [`tests`](https://github.com/explosion/spaCy/labels/tests) | Missing or incorrect [tests](spacy/tests) | | [`docs`](https://github.com/explosion/spaCy/labels/docs), [`examples`](https://github.com/explosion/spaCy/labels/examples) | Issues related to the [documentation](https://spacy.io/docs) and [examples](spacy/examples) | -| [`models`](https://github.com/explosion/spaCy/labels/models), [`english`](https://github.com/explosion/spaCy/labels/english), [`german`](https://github.com/explosion/spaCy/labels/german) | Issues related to the specific [models](https://github.com/explosion/spacy-models), languages and data | +| [`models`](https://github.com/explosion/spaCy/labels/models), `language / [name]` | Issues related to the specific [models](https://github.com/explosion/spacy-models), languages and data | | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems | | 
[`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers | | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | From 3a9710f35698a55822d80ded5d66239e6dcdba8a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 15:57:53 +0200 Subject: [PATCH 033/111] Pass dev_scores to print_progress correctly (resolves #1008) Only read scores attribute if command is used with dev_data, otherwise default dev_scores to empty dict. --- spacy/cli/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 3900c7f39..8557019c6 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -62,10 +62,10 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_ for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)): for doc, gold in epoch: trainer.update(doc, gold) - dev_scores = trainer.evaluate(dev_data) if dev_data else [] + dev_scores = trainer.evaluate(dev_data).scores if dev_data else {} print_progress(itn, trainer.nlp.parser.model.nr_weight, trainer.nlp.parser.model.nr_active_feat, - **dev_scores.scores) + **dev_scores) def evaluate(Language, gold_tuples, output_path): From 2bfec1a4f8b0c3218b731084a1a1619c72d69967 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 15:58:38 +0200 Subject: [PATCH 034/111] Add note on languages with non-latin characters (see #996) --- website/docs/usage/adding-languages.jade | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 0c98cc5ca..50b626b99 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -98,6 +98,17 @@ p | so that Python functions can be used to help you generalise and combine | the data as you require. ++infobox("For languages with non-latin characters") + | In order for the tokenizer to split suffixes, prefixes and infixes, spaCy + | needs to know the language's character set. If the language you're adding + | uses non-latin characters, you might need to add the required character + | classes to the global + | #[+src(gh("spacy", "spacy/language_data/punctuation.py")) punctuation.py]. + | spaCy uses the #[+a("https://pypi.python.org/pypi/regex/") #[code regex] library] + | to keep this simple and readable. If the language requires very specific + | punctuation rules, you should consider overwriting the default regular + | expressions with your own in the language's #[code Defaults]. + +h(3, "stop-words") Stop words p From 040751ad17c96a6e6bf2c0d70e7e789cbd8dd7d6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 16:28:55 +0200 Subject: [PATCH 035/111] Remove xfail on Test #910 --- spacy/tests/regression/test_issue910.py | 27 +++++++++++-------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/spacy/tests/regression/test_issue910.py b/spacy/tests/regression/test_issue910.py index 9b2b2287b..3790d429b 100644 --- a/spacy/tests/regression/test_issue910.py +++ b/spacy/tests/regression/test_issue910.py @@ -70,7 +70,6 @@ def temp_save_model(model): -@pytest.mark.xfail @pytest.mark.models def test_issue910(train_data, additional_entity_types): '''Test that adding entities and resuming training works passably OK. 
@@ -85,11 +84,10 @@ def test_issue910(train_data, additional_entity_types): ents_before_train = [(ent.label_, ent.text) for ent in doc.ents] # Fine tune the ner model for entity_type in additional_entity_types: - if entity_type not in nlp.entity.cfg['actions']['1']: - nlp.entity.add_label(entity_type) + nlp.entity.add_label(entity_type) - nlp.entity.learn_rate = 0.001 - for itn in range(4): + nlp.entity.model.learn_rate = 0.001 + for itn in range(10): random.shuffle(train_data) for raw_text, entity_offsets in train_data: doc = nlp.make_doc(raw_text) @@ -101,13 +99,12 @@ def test_issue910(train_data, additional_entity_types): # Load the fine tuned model loaded_ner = EntityRecognizer.load(model_dir, nlp.vocab) - for entity_type in additional_entity_types: - if entity_type not in loaded_ner.cfg['actions']['1']: - loaded_ner.add_label(entity_type) - - doc = nlp(u"I am looking for a restaurant in Berlin", entity=False) - nlp.tagger(doc) - loaded_ner(doc) - - ents_after_train = [(ent.label_, ent.text) for ent in doc.ents] - assert ents_before_train == ents_after_train + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + nlp.tagger(doc) + loaded_ner(doc) + ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} + for start, end, label in entity_offsets: + if (start, end) not in ents: + print(ents) + assert ents[(start, end)] == label From 4d2a659c52a7cb52aa74120257563a6a99d18c44 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 17:05:53 +0200 Subject: [PATCH 036/111] Fix json dump for Python3 --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 854b0ebeb..dc2e98414 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -195,7 +195,7 @@ class Language(object): if directory.exists(): shutil.rmtree(str(directory)) directory.mkdir() - with (directory / 'config.json').open('wb') as file_: + with (directory / 'config.json').open('w') as file_: data = json_dumps(config) file_.write(data) if not (path / 'vocab').exists(): From 5d8af404455bd8ed54ad4cfc99b44bb8beda3cc1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 17:06:30 +0200 Subject: [PATCH 037/111] Add test for Issue #999 --- spacy/tests/regression/test_issue999.py | 75 +++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 spacy/tests/regression/test_issue999.py diff --git a/spacy/tests/regression/test_issue999.py b/spacy/tests/regression/test_issue999.py new file mode 100644 index 000000000..6e3e9af3f --- /dev/null +++ b/spacy/tests/regression/test_issue999.py @@ -0,0 +1,75 @@ +from __future__ import unicode_literals +import json +import os +import random +import contextlib +import shutil +import pytest +import tempfile +from pathlib import Path + + +import pathlib +from ...gold import GoldParse +from ...pipeline import EntityRecognizer +from ...en import English + +try: + unicode +except NameError: + unicode = str + + +@pytest.fixture +def train_data(): + return [ + ["hey",[]], + ["howdy",[]], + ["hey there",[]], + ["hello",[]], + ["hi",[]], + ["i'm looking for a place to eat",[]], + ["i'm looking for a place in the north of town",[[31,36,"location"]]], + ["show me chinese restaurants",[[8,15,"cuisine"]]], + ["show me chines restaurants",[[8,14,"cuisine"]]], + ] + + +@contextlib.contextmanager +def temp_save_model(model): + model_dir = Path(tempfile.mkdtemp()) + model.save_to_directory(model_dir) + yield model_dir + shutil.rmtree(model_dir.as_posix()) + + 
+def test_issue999(train_data): + '''Test that adding entities and resuming training works passably OK. + There are two issues here: + + 1) We have to readd labels. This isn't very nice. + 2) There's no way to set the learning rate for the weight update, so we + end up out-of-scale, causing it to learn too fast. + ''' + nlp = English(entity=False) + nlp.entity = EntityRecognizer(nlp.vocab, features=English.Defaults.entity_features) + for _, offsets in train_data: + for start, end, ent_type in offsets: + nlp.entity.add_label(ent_type) + for itn in range(10): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + doc = nlp.make_doc(raw_text) + gold = GoldParse(doc, entities=entity_offsets) + loss = nlp.entity.update(doc, gold) + + with temp_save_model(nlp) as model_dir: + nlp2 = English(path=model_dir) + + for raw_text, entity_offsets in train_data: + doc = nlp2(raw_text) + ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} + for start, end, label in entity_offsets: + if (start, end) not in ents: + print(ents) + assert ents[(start, end)] == label From 60703cede53b79a68352ff3a48fe8bf23dd1c8fc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 17:56:39 +0200 Subject: [PATCH 038/111] Ensure noun chunks can't be nested. Closes #955 --- spacy/syntax/iterators.pyx | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index c3ba2ca92..e1c44da7f 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -10,15 +10,22 @@ def english_noun_chunks(obj): Works on both Doc and Span. """ labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', - 'attr', 'ROOT', 'root'] + 'attr', 'ROOT'] doc = obj.doc # Ensure works on both Doc and Span. np_deps = [doc.vocab.strings[label] for label in labels] conj = doc.vocab.strings['conj'] np_label = doc.vocab.strings['NP'] + seen = set() for i, word in enumerate(obj): if word.pos not in (NOUN, PROPN, PRON): continue + # Prevent nested chunks from being produced + if word.i in seen: + continue if word.dep in np_deps: + if any(w.i in seen for w in word.subtree): + continue + seen.update(j for j in range(word.left_edge.i, word.i+1)) yield word.left_edge.i, word.i+1, np_label elif word.dep == conj: head = word.head @@ -26,6 +33,9 @@ def english_noun_chunks(obj): head = head.head # If the head is an NP, and we're coordinated to it, we're an NP if head.dep in np_deps: + if any(w.i in seen for w in word.subtree): + continue + seen.update(j for j in range(word.left_edge.i, word.i+1)) yield word.left_edge.i, word.i+1, np_label From 874a3cbb073b8bf79925c3c620e09245afa2157e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 17:57:01 +0200 Subject: [PATCH 039/111] Add test for Issue #955 --- spacy/tests/regression/test_issue995.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 spacy/tests/regression/test_issue995.py diff --git a/spacy/tests/regression/test_issue995.py b/spacy/tests/regression/test_issue995.py new file mode 100644 index 000000000..715c24607 --- /dev/null +++ b/spacy/tests/regression/test_issue995.py @@ -0,0 +1,20 @@ +import pytest +from ... 
import load as load_spacy + +@pytest.fixture +def doc(): + nlp = load_spacy('en') + return nlp('Does flight number three fifty-four require a connecting flight' + ' to get to Boston?') + + +@pytest.mark.models +def test_issue955(doc): + '''Test that we don't have any nested noun chunks''' + seen_tokens = set() + for np in doc.noun_chunks: + print(np.text, np.root.text, np.root.dep_, np.root.tag_) + for word in np: + key = (word.i, word.text) + assert key not in seen_tokens + seen_tokens.add(key) From d2436dc17be25b0f3ad220212c542e66c44bb512 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 18:14:37 +0200 Subject: [PATCH 040/111] Update fix for Issue #999 --- spacy/syntax/parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 4f240d75f..57606dc76 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -146,7 +146,7 @@ cdef class Parser: if 'labels' in cfg and 'actions' not in cfg: cfg['actions'] = cfg.pop('labels') # TODO: remove this shim when we don't have to support older data - for action_name, labels in dict(cfg['actions']).items(): + for action_name, labels in dict(cfg.get('actions', {})).items(): # We need this to be sorted if isinstance(labels, dict): labels = list(sorted(labels.keys())) From e033c86a642487076e1760f694439716e8e8943a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 21:03:43 +0200 Subject: [PATCH 041/111] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 5e438c7af..e6999b136 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.8.0' +__version__ = '1.8.1' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 401045433c3c2ab9957af97914a4f9ebb0949055 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 21:05:39 +0200 Subject: [PATCH 042/111] Simplify compat.fix_text --- spacy/compat.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index 8458df7b0..1ca8a59fe 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -23,6 +23,8 @@ is_windows = sys.platform.startswith('win') is_linux = sys.platform.startswith('linux') is_osx = sys.platform == 'darwin' +fix_text = ftfy.fix_text + if is_python2: bytes_ = str @@ -39,9 +41,6 @@ elif is_python3: json_dumps = lambda data: ujson.dumps(data, indent=2) -fix_text = lambda text: ftfy.fix_text(text) - - def symlink_to(orig, dest): if is_python2 and is_windows: import subprocess From 83f66947dc06409d6b4260f9256147da518544c3 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 21:06:36 +0200 Subject: [PATCH 043/111] Rename test_download to test_cli --- spacy/tests/{test_download.py => test_cli.py} | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) rename spacy/tests/{test_download.py => test_cli.py} (79%) diff --git a/spacy/tests/test_download.py b/spacy/tests/test_cli.py similarity index 79% rename from spacy/tests/test_download.py rename to spacy/tests/test_cli.py index 4bbe190df..189a238fc 100644 --- a/spacy/tests/test_download.py +++ b/spacy/tests/test_cli.py @@ -2,17 +2,18 @@ from __future__ import unicode_literals from ..cli.download import download, get_compatibility, get_version, check_error_depr + import pytest 
@pytest.mark.parametrize('model', ['en_core_web_md']) -def test_download_get_matching_version_succeeds(model): +def test_cli_download_get_matching_version_succeeds(model): comp = { model: ['1.7.0', '0.100.0'] } assert get_version(model, comp) @pytest.mark.parametrize('model', ['en_core_web_md']) -def test_download_get_matching_version_fails(model): +def test_cli_download_get_matching_version_fails(model): diff_model = 'test_' + model comp = { diff_model: ['1.7.0', '0.100.0'] } with pytest.raises(SystemExit): @@ -20,6 +21,6 @@ def test_download_get_matching_version_fails(model): @pytest.mark.parametrize('model', [False, None, '', 'all']) -def test_download_no_model_depr_error(model): +def test_cli_download_no_model_depr_error(model): with pytest.raises(SystemExit): check_error_depr(model) From 012ea594d171041a9ff064783b7c0a392a827d4c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 21:06:46 +0200 Subject: [PATCH 044/111] Add file for misc tests --- spacy/tests/test_misc.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 spacy/tests/test_misc.py diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py new file mode 100644 index 000000000..c55e587e7 --- /dev/null +++ b/spacy/tests/test_misc.py @@ -0,0 +1,19 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from ..util import ensure_path + +from pathlib import Path +import pytest + + +@pytest.mark.parametrize('text', ['hello/world', 'hello world']) +def test_util_ensure_path_succeeds(text): + path = ensure_path(text) + assert isinstance(path, Path) + + +@pytest.mark.parametrize('text', [b'hello/world', True, False, None]) +def test_util_ensure_path_fails(text): + path = ensure_path(text) + assert not isinstance(path, Path) From 42305bc519653c1f46bc0acb2ced8196ddc02025 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 21:21:41 +0200 Subject: [PATCH 045/111] Remove unnecessary test --- spacy/tests/test_misc.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index c55e587e7..41c4efb8a 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -11,9 +11,3 @@ import pytest def test_util_ensure_path_succeeds(text): path = ensure_path(text) assert isinstance(path, Path) - - -@pytest.mark.parametrize('text', [b'hello/world', True, False, None]) -def test_util_ensure_path_fails(text): - path = ensure_path(text) - assert not isinstance(path, Path) From d0e19267e8d14463a50f5ca24367015ca50c97d9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 21:24:43 +0200 Subject: [PATCH 046/111] Create directory if missing in save_to_directory --- spacy/language.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/language.py b/spacy/language.py index dc2e98414..da02df53d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -366,6 +366,8 @@ class Language(object): } path = util.ensure_path(path) + if not path.exists(): + path.mkdir() self.setup_directory(path, **configs) strings_loc = path / 'vocab' / 'strings.json' From 9beb216c0e5744976488d0973c26742af11ab4f1 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 23 Apr 2017 22:00:44 +0200 Subject: [PATCH 047/111] Update changelog --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index d08860fb8..3979b2e37 100644 --- a/README.rst +++ b/README.rst @@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== 
============== =========== +`v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes `v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading `v1.7.5`_ ``2017-04-07`` Bug fixes and new CLI commands `v1.7.3`_ ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes @@ -351,6 +352,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 .. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 .. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5 .. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3 From 4f9657b42b79af4c640b67e1d2740b9d901d5aba Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 22:27:10 +0200 Subject: [PATCH 048/111] Fix reporting if no dev data with train --- spacy/cli/train.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 8557019c6..ce4c77416 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals, division, print_function import json +from collections import defaultdict from ..util import ensure_path from ..scorer import Scorer @@ -62,7 +63,7 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_ for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)): for doc, gold in epoch: trainer.update(doc, gold) - dev_scores = trainer.evaluate(dev_data).scores if dev_data else {} + dev_scores = trainer.evaluate(dev_data).scores if dev_data else defaultdict(float) print_progress(itn, trainer.nlp.parser.model.nr_weight, trainer.nlp.parser.model.nr_active_feat, **dev_scores) From 3973af2d155d4840c8cf6f44fe2643d77c4b8a3c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Apr 2017 22:59:34 +0200 Subject: [PATCH 049/111] Make training test less flakey --- spacy/tests/regression/test_issue999.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue999.py b/spacy/tests/regression/test_issue999.py index 6e3e9af3f..8726adede 100644 --- a/spacy/tests/regression/test_issue999.py +++ b/spacy/tests/regression/test_issue999.py @@ -56,7 +56,7 @@ def test_issue999(train_data): for _, offsets in train_data: for start, end, ent_type in offsets: nlp.entity.add_label(ent_type) - for itn in range(10): + for itn in range(100): random.shuffle(train_data) for raw_text, entity_offsets in train_data: doc = nlp.make_doc(raw_text) From 70a43858e16184c2bc408b404a3b11dcad4f9526 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Apr 2017 00:06:30 +0200 Subject: [PATCH 050/111] Fix flakey test --- spacy/tests/regression/test_issue999.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/tests/regression/test_issue999.py b/spacy/tests/regression/test_issue999.py index 8726adede..e32d5671d 100644 --- a/spacy/tests/regression/test_issue999.py +++ b/spacy/tests/regression/test_issue999.py @@ -70,6 +70,8 @@ def test_issue999(train_data): doc = nlp2(raw_text) ents = {(ent.start_char, ent.end_char): ent.label_ for ent in doc.ents} for start, end, label in entity_offsets: - if (start, end) not in ents: - print(ents) - assert ents[(start, end)] == label + if (start, end) in ents: + assert ents[(start, end)] == label + break + else: + raise Exception(ents) From 65f10b53e54396472d4c4da098f559bec7bb9287 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal 
Date: Mon, 24 Apr 2017 00:25:55 +0200 Subject: [PATCH 051/111] Fix test --- spacy/tests/regression/test_issue999.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tests/regression/test_issue999.py b/spacy/tests/regression/test_issue999.py index e32d5671d..0886acbaf 100644 --- a/spacy/tests/regression/test_issue999.py +++ b/spacy/tests/regression/test_issue999.py @@ -56,6 +56,7 @@ def test_issue999(train_data): for _, offsets in train_data: for start, end, ent_type in offsets: nlp.entity.add_label(ent_type) + nlp.entity.model.learn_rate = 0.001 for itn in range(100): random.shuffle(train_data) for raw_text, entity_offsets in train_data: @@ -74,4 +75,5 @@ def test_issue999(train_data): assert ents[(start, end)] == label break else: - raise Exception(ents) + if entity_offsets: + raise Exception(ents) From c4be9c36fe969a590e875aaba1d33ce940083967 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Apr 2017 10:09:01 +0200 Subject: [PATCH 052/111] Fix unicode header in tests --- spacy/tests/regression/test_issue758.py | 1 + spacy/tests/regression/test_issue995.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/spacy/tests/regression/test_issue758.py b/spacy/tests/regression/test_issue758.py index c0bbb6945..a059f095f 100644 --- a/spacy/tests/regression/test_issue758.py +++ b/spacy/tests/regression/test_issue758.py @@ -1,3 +1,4 @@ +from __future__ import unicode_literals from ... import load as load_spacy from ...attrs import LEMMA from ...matcher import merge_phrase diff --git a/spacy/tests/regression/test_issue995.py b/spacy/tests/regression/test_issue995.py index 715c24607..633e96fb5 100644 --- a/spacy/tests/regression/test_issue995.py +++ b/spacy/tests/regression/test_issue995.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import pytest from ... import load as load_spacy From f997bceb07a3755302f90d2779a87a7c954309c5 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 24 Apr 2017 11:55:41 +0200 Subject: [PATCH 053/111] Make object of the deep learning tutorial clearer This is a great tutorial, but I think it is weirdly explained in the current form. The largest part of the code is about implementing the actual sentiment analysis model, not about counting entities. (which is not even present in the `deep_learning_keras.py` script in `examples`) --- website/docs/usage/deep-learning.jade | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/deep-learning.jade b/website/docs/usage/deep-learning.jade index a1252a4fe..fec01b4ba 100644 --- a/website/docs/usage/deep-learning.jade +++ b/website/docs/usage/deep-learning.jade @@ -4,9 +4,9 @@ include ../../_includes/_mixins p | In this example, we'll be using #[+a("https://keras.io/") Keras], as - | it's the most popular deep learning library for Python. Let's assume - | you've written a custom sentiment analysis model that predicts whether a - | document is positive or negative. Now you want to find which entities + | it's the most popular deep learning library for Python. Using Keras, + | we will write a custom sentiment analysis model that predicts whether a + | document is positive or negative. Then, we will use it to find which entities | are commonly associated with positive or negative documents. Here's a | quick example of how that can look at runtime. 
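A rough sketch of that runtime usage, assuming an English pipeline in which a custom component (for example the Keras sentiment model described in that tutorial) has already written a score to doc.sentiment, and using placeholder texts and an arbitrary 0.5 threshold, might count which entities occur in positive versus negative documents like this:

    # Sketch only: assumes some custom component or hook sets doc.sentiment
    # at runtime; the texts and the 0.5 cut-off below are placeholders.
    from collections import Counter
    import spacy

    nlp = spacy.load('en')  # plus the custom sentiment component
    entity_sentiment = Counter()

    texts = [u'The acting was great.', u'The plot was a mess.']
    for doc in nlp.pipe(texts):
        label = 'positive' if doc.sentiment >= 0.5 else 'negative'
        for ent in doc.ents:
            # Tally each named entity under the document's sentiment label.
            entity_sentiment[(ent.text, label)] += 1

    print(entity_sentiment.most_common(10))
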
From ae2b77db1b45568511617fdc08cb1ff53208d2e1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 14:00:40 +0200 Subject: [PATCH 054/111] Fix info on naming conventions --- website/docs/usage/saving-loading.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index a3edfce50..c4eb08f04 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -61,7 +61,7 @@ p | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. | If you're creating the package manually, keep in mind that the directories | need to be named according to the naming conventions of - | #[code [language]_[type]] and #[code [language]_[type]-[version]]. The + | #[code [language]_[name]] and #[code [language]_[name]-[version]]. The | #[code lang] setting in the meta.json is also used to create the | respective #[code Language] class in spaCy, which will later be returned | by the model's #[code load()] method. From e6bdf5bc5cf6d030241e630af9adf3c70f7892ff Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 14:01:15 +0200 Subject: [PATCH 055/111] Update adding language / training docs (see #966) Add data examples and more info on training and CLI commands --- website/docs/usage/adding-languages.jade | 163 ++++++++++++++++------- 1 file changed, 117 insertions(+), 46 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 50b626b99..67ac8d610 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -27,9 +27,10 @@ p | #[a(href="#brown-clusters") Brown clusters] and | #[a(href="#word-vectors") word vectors]. + +item + | #[strong Set up] a #[a(href="#model-directory") model direcory] and #[strong train] the #[a(href="#train-tagger-parser") tagger and parser]. + p - | Once you have the tokenizer and vocabulary, you can - | #[+a("/docs/usage/training") train the tagger, parser and entity recognizer]. | For some languages, you may also want to develop a solution for | lemmatization and morphological analysis. @@ -406,12 +407,111 @@ p | by linear models, while the word vectors are useful for lexical | similarity models and deep learning. ++h(3, "word-frequencies") Word frequencies + +p + | To generate the word frequencies from a large, raw corpus, you can use the + | #[+src(gh("spacy-dev-resources", "training/word_freqs.py")) word_freqs.py] + | script from the spaCy developer resources. Note that your corpus should + | not be preprocessed (i.e. you need punctuation for example). The + | #[+a("/docs/usage/cli#model") #[code model] command] expects a + | tab-separated word frequencies file with three columns: + ++list("numbers") + +item The number of times the word occurred in your language sample. + +item The number of distinct documents the word occurred in. + +item The word itself. + +p + | An example word frequencies file could look like this: + ++code("es_word_freqs.txt", "text"). + 6361109 111 Aunque + 23598543 111 aunque + 10097056 111 claro + 193454 111 aro + 7711123 111 viene + 12812323 111 mal + 23414636 111 momento + 2014580 111 felicidad + 233865 111 repleto + 15527 111 eto + 235565 111 deliciosos + 17259079 111 buena + 71155 111 Anímate + 37705 111 anímate + 33155 111 cuéntanos + 2389171 111 cuál + 961576 111 típico + +p + | You should make sure you use the spaCy tokenizer for your + | language to segment the text for your word frequencies. 
This will ensure + | that the frequencies refer to the same segmentation standards you'll be + | using at run-time. For instance, spaCy's English tokenizer segments + | "can't" into two tokens. If we segmented the text by whitespace to + | produce the frequency counts, we'll have incorrect frequency counts for + | the tokens "ca" and "n't". + ++h(3, "brown-clusters") Training the Brown clusters + +p + | spaCy's tagger, parser and entity recognizer are designed to use + | distributional similarity features provided by the + | #[+a("https://github.com/percyliang/brown-cluster") Brown clustering algorithm]. + | You should train a model with between 500 and 1000 clusters. A minimum + | frequency threshold of 10 usually works well. + +p + | An example clusters file could look like this: + ++code("es_clusters.data", "text"). + 0000 Vestigial 1 + 0000 Vesturland 1 + 0000 Veyreau 1 + 0000 Veynes 1 + 0000 Vexilografía 1 + 0000 Vetrigne 1 + 0000 Vetónica 1 + 0000 Asunden 1 + 0000 Villalambrús 1 + 0000 Vichuquén 1 + 0000 Vichtis 1 + 0000 Vichigasta 1 + 0000 VAAH 1 + 0000 Viciebsk 1 + 0000 Vicovaro 1 + 0000 Villardeveyo 1 + 0000 Vidala 1 + 0000 Videoguard 1 + 0000 Vedás 1 + 0000 Videocomunicado 1 + 0000 VideoCrypt 1 + ++h(3, "word-vectors") Training the word vectors + +p + | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related + | algorithms let you train useful word similarity models from unlabelled + | text. This is a key part of using + | #[+a("/docs/usage/deep-learning") deep learning] for NLP with limited + | labelled data. The vectors are also useful by themselves – they power + | the #[code .similarity()] methods in spaCy. For best results, you should + | pre-process the text with spaCy before training the Word2vec model. This + | ensures your tokenization will match. + +p + | You can use our + | #[+src(gh("spacy-dev-resources", "training/word_vectors.py")) word vectors training script], + | which pre-processes the text with your language-specific tokenizer and + | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim]. + | The #[code vectors.bin] file should consist of one word and vector per line. + ++h(2, "model-directory") Setting up a model directory + p | Once you've collected the word frequencies, Brown clusters and word | vectors files, you can use the - | #[+src(gh("spacy-dev-resources", "training/init.py")) init.py] - | script from our - | #[+a(gh("spacy-dev-resources")) developer resources], or use the new | #[+a("/docs/usage/cli#model") #[code model] command] to create a data | directory: @@ -438,49 +538,20 @@ p | loaded. By default, the command expects to be able to find your language | class using #[code spacy.util.get_lang_class(lang_id)]. -+h(3, "word-frequencies") Word frequencies + ++h(2, "train-tagger-parser") Training the tagger and parser p - | The #[+a("/docs/usage/cli#model") #[code model] command] expects a - | tab-separated word frequencies file with three columns: - -+list("numbers") - +item The number of times the word occurred in your language sample. - +item The number of distinct documents the word occurred in. - +item The word itself. + | You can now train the model using a corpus for your language annotated + | with #[+a("http://universaldependencies.org/") Universal Dependencies]. + | If your corpus uses the connlu format, you can use the + | #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to + | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. 
p - | You should make sure you use the spaCy tokenizer for your - | language to segment the text for your word frequencies. This will ensure - | that the frequencies refer to the same segmentation standards you'll be - | using at run-time. For instance, spaCy's English tokenizer segments - | "can't" into two tokens. If we segmented the text by whitespace to - | produce the frequency counts, we'll have incorrect frequency counts for - | the tokens "ca" and "n't". + | Once you have your UD corpus transformed into JSON, you can train your + | model use the using spaCy's + | #[+a("/docs/usage/cli#train") #[code train] command]: -+h(3, "brown-clusters") Training the Brown clusters - -p - | spaCy's tagger, parser and entity recognizer are designed to use - | distributional similarity features provided by the - | #[+a("https://github.com/percyliang/brown-cluster") Brown clustering algorithm]. - | You should train a model with between 500 and 1000 clusters. A minimum - | frequency threshold of 10 usually works well. - -+h(3, "word-vectors") Training the word vectors - -p - | #[+a("https://en.wikipedia.org/wiki/Word2vec") Word2vec] and related - | algorithms let you train useful word similarity models from unlabelled - | text. This is a key part of using - | #[+a("/docs/usage/deep-learning") deep learning] for NLP with limited - | labelled data. The vectors are also useful by themselves – they power - | the #[code .similarity()] methods in spaCy. For best results, you should - | pre-process the text with spaCy before training the Word2vec model. This - | ensures your tokenization will match. - -p - | You can use our - | #[+src(gh("spacy-dev-resources", "training/word_vectors.py")) word vectors training script], - | which pre-processes the text with your language-specific tokenizer and - | trains the model using #[+a("https://radimrehurek.com/gensim/") Gensim]. ++code(false, "bash"). 
+ python -m spacy train [lang] [output_dir] [train_data] [dev_data] [--n_iter] [--parser_L1] [--no_tagger] [--no_parser] [--no_ner] From 99558023fd0825caf5f50441e8553feb423d2b1e Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 16:02:44 +0200 Subject: [PATCH 056/111] Add divider table row style --- website/_harp.json | 2 +- website/assets/css/_components/_tables.sass | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index 03fcbb956..0fd4d4a07 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -55,7 +55,7 @@ } }, - "V_CSS": "1.4", + "V_CSS": "1.5", "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", diff --git a/website/assets/css/_components/_tables.sass b/website/assets/css/_components/_tables.sass index f87169fb1..6ade4f6a1 100644 --- a/website/assets/css/_components/_tables.sass +++ b/website/assets/css/_components/_tables.sass @@ -20,6 +20,9 @@ @extend .u-text-label color: $color-theme + &.c-table__row--divider + border-top: 2px solid $color-theme + //- Table cell From 6c4f3c6fc2a014a51d4f2b69fe670686ed1cea12 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 16:02:59 +0200 Subject: [PATCH 057/111] Allow styles arguments on row mixin --- website/_includes/_mixins.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index ba5a9297e..9ca60b87d 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -198,8 +198,8 @@ mixin table(head) //- Table row (only used within +table) -mixin row() - tr.c-table__row&attributes(attributes) +mixin row(...style) + tr.c-table__row(class=prefixArgs(style, "c-table__row"))&attributes(attributes) block From 5d598b67476a072fdd4fc088b7f9e2ed24a10f1f Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 16:03:05 +0200 Subject: [PATCH 058/111] Add star icon --- website/assets/img/icons.svg | 3 +++ 1 file changed, 3 insertions(+) diff --git a/website/assets/img/icons.svg b/website/assets/img/icons.svg index 224224084..f62901592 100644 --- a/website/assets/img/icons.svg +++ b/website/assets/img/icons.svg @@ -24,5 +24,8 @@ + + + From 5a470367df67bafc4960bab85192655c7a456736 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 16:03:17 +0200 Subject: [PATCH 059/111] Add mixin for model row in model docs --- website/_includes/_mixins.jade | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 9ca60b87d..9766f5c5a 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -283,3 +283,21 @@ mixin card-item(title, details) if details.author br span.u-text-small.u-color-subtle by #{details.author} + + +//- Model row for models table + +mixin model-row(name, lang, procon, size, license, default_model, divider) + - var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" } + + +row(divider ? "divider": null) + +cell #[code=name] + if default_model + | #[span.u-color-theme(title="default model") #[+icon("star", 16)]] + +cell=lang + each icon in procon + +cell.u-text-center #[+procon(icon ? 
"pro" : "con")] + +cell.u-text-right=size + +cell + if license in licenses + +a(licenses[license])=license From c2006166d39895098d70269d0bb9052d3bde15d1 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 16:03:41 +0200 Subject: [PATCH 060/111] Update list of available models and info --- website/docs/usage/models.jade | 61 +++++++++++++--------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 9d50dcbc0..69142b351 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -13,14 +13,6 @@ p | internal alias that tells spaCy where to find the data files for a specific | model name. -+infobox("Important note") - | Due to improvements in the English lemmatizer in v1.7.0, you need to - | #[strong download the new English models]. The German model is still - | compatible. If you've trained statistical models that use spaCy's - | annotations, you should #[strong retrain your models after updating spaCy]. - | If you don't retrain your models, you may suffer train/test skew, which - | might decrease your accuracy. - +aside-code("Quickstart"). # Install spaCy and download English model pip install spacy @@ -31,43 +23,38 @@ p nlp = spacy.load('en') doc = nlp(u'This is a sentence.') ++infobox("Important note") + | Due to improvements in the English lemmatizer in v1.7.0, you need to + | #[strong download the new English models]. The German model is still + | compatible. If you've trained statistical models that use spaCy's + | annotations, you should #[strong retrain your models after updating spaCy]. + | If you don't retrain your models, you may suffer train/test skew, which + | might decrease your accuracy. + +h(2, "available") Available models -+table(["Name", "Size", "Description"]) - +row - +cell #[code en_core_web_sm] - +cell 50 MB - +cell Vocab, syntax, entities, word vectors #[+tag default] - - +row - +cell #[code en_core_web_md] - +cell 1 GB - +cell Vocab, syntax, entities, word vectors - - +row - +cell #[code en_depent_web_md] - +cell 328 MB - +cell Vocab, syntax, entities - - +row - +cell #[code en_vectors_glove_md] - +cell 727 MB - +cell - | #[+a("http://nlp.stanford.edu/projects/glove/") GloVe] Common - | Crawl vectors - - +row - +cell #[code de_core_news_md] - +cell 645 MB - +cell Vocab, syntax, entities, word vectors #[+tag default] - p + | Model differences are mostly statistical. In general, we do expect larger + | models to be "better" and more accurate overall. Ultimately, it depends on + | your use case and requirements, and we recommend starting with the default + | models (marked with a star below). + ++aside | Models are now available as #[code .tar.gz] archives #[+a(gh("spacy-models")) from GitHub], | attached to individual releases. They can be downloaded and loaded manually, | or using spaCy's #[code download] and #[code link] commands. All models | follow the naming convention of #[code [language]_[type]_[genre]_[size]]. 
+ | #[br]#[br] -+button(gh("spacy-models") + "/releases", true, "primary") View models + +button(gh("spacy-models"), true, "primary").u-text-tag + | View model releases + ++table(["Name", "Language", "Voc", "Dep", "Ent", "Vec", "Size", "License"]) + +model-row("en_core_web_sm", "English", [1, 1, 1, 1], "50 MB", "CC BY-SA", true) + +model-row("en_core_web_md", "English", [1, 1, 1, 1], "1 GB", "CC BY-SA") + +model-row("en_depent_web_md", "English", [1, 1, 1, 0], "328 MB", "CC BY-SA") + +model-row("en_vectors_glove_md", "English", [0, 0, 0, 1], "727 MB", "CC BY-SA") + +model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true) +h(2, "download") Downloading models From 527d51ac9a284a0fc702e59a6467237216321be7 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 18:00:28 +0200 Subject: [PATCH 061/111] Fetch shortcuts from GitHub and improve error handling --- spacy/about.py | 2 +- spacy/cli/download.py | 28 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index e6999b136..6498f80ee 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -13,4 +13,4 @@ __license__ = 'MIT' __docs__ = 'https://spacy.io/docs/usage' __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' -__shortcuts__ = {'en': 'en_core_web_sm', 'de': 'de_core_news_md', 'vectors': 'en_vectors_glove_md'} +__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts.json' diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 06333eabf..0419de118 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -17,37 +17,45 @@ def download(model=None, direct=False): if direct: download_model('{m}/{m}.tar.gz'.format(m=model)) else: - model_name = about.__shortcuts__[model] if model in about.__shortcuts__ else model + model_name = check_shortcut(model) compatibility = get_compatibility() version = get_version(model_name, compatibility) download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, v=version)) link_package(model_name, model, force=True) -def get_compatibility(): - version = about.__version__ - r = requests.get(about.__compatibility__) +def get_json(url, desc): + r = requests.get(url) if r.status_code != 200: util.sys_exit( - "Couldn't fetch compatibility table. Please find the right model for " - "your spaCy installation (v{v}), and download it manually:".format(v=version), + "Couldn't fetch {d}. 
Please find the right model for your spaCy " + "installation (v{v}), and download it manually:".format(d=desc, v=about.__version__), "python -m spacy.download [full model name + version] --direct", title="Server error ({c})".format(c=r.status_code)) + return r.json() - comp = r.json()['spacy'] + +def check_shortcut(model): + shortcuts = get_json(about.__shortcuts__, "available shortcuts") + return shortcuts.get(model, model) + + +def get_compatibility(): + version = about.__version__ + comp_table = get_json(about.__compatibility__, "compatibility table") + comp = comp_table['spacy'] if version not in comp: util.sys_exit( "No compatible models found for v{v} of spaCy.".format(v=version), title="Compatibility error") - else: - return comp[version] + return comp[version] def get_version(model, comp): if model not in comp: util.sys_exit( "No compatible model found for " - "{m} (spaCy v{v}).".format(m=model, v=about.__version__), + "'{m}' (spaCy v{v}).".format(m=model, v=about.__version__), title="Compatibility error") return comp[model][0] From cbfe4920bb88623803f7210dfa90e90d6cc8b08a Mon Sep 17 00:00:00 2001 From: luvogels Date: Wed, 26 Apr 2017 18:02:34 +0200 Subject: [PATCH 062/111] Added contributor agreement and pull request doc --- .github/Alpha_support_Norwegian bokmål.md | 20 ++++ .github/contributors/luvogels.md | 106 ++++++++++++++++++++++ 2 files changed, 126 insertions(+) create mode 100644 .github/Alpha_support_Norwegian bokmål.md create mode 100644 .github/contributors/luvogels.md diff --git a/.github/Alpha_support_Norwegian bokmål.md b/.github/Alpha_support_Norwegian bokmål.md new file mode 100644 index 000000000..0917810ab --- /dev/null +++ b/.github/Alpha_support_Norwegian bokmål.md @@ -0,0 +1,20 @@ + + +## Description + + +Added alpha support for Norwegian bokmål. + +## Types of changes + +- [ ] **Bug fix** (non-breaking change fixing an issue) +- [x] **New feature** (non-breaking change adding functionality to spaCy) +- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) +- [ ] **Documentation** (addition to documentation of spaCy) + +## Checklist: + +- [ ] My change requires a change to spaCy's documentation. +- [ ] I have updated the documentation accordingly. +- [ ] I have added tests to cover my changes. +- [ ] All new and existing tests passed. diff --git a/.github/contributors/luvogels.md b/.github/contributors/luvogels.md new file mode 100644 index 000000000..c915d48bf --- /dev/null +++ b/.github/contributors/luvogels.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. 
These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Shuvanon Razik | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 3/12/2017 | +| GitHub username | shuvanon | +| Website (optional) | | From 460094bf092714216273e46c33f2dde8b3a73a25 Mon Sep 17 00:00:00 2001 From: Leif Uwe Vogelsang Date: Wed, 26 Apr 2017 18:27:55 +0200 Subject: [PATCH 063/111] Update __init__.py --- spacy/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 19cc61c06..ce68dfc1a 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,8 +4,6 @@ from __future__ import unicode_literals from . import util from .deprecated import resolve_model_name from .cli.info import info -<<<<<<< HEAD - from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb From 24c4c51f1340667a358e3a3b7e567100d1041db0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Apr 2017 18:42:06 +0200 Subject: [PATCH 064/111] Try to make test999 less flakey --- spacy/tests/regression/test_issue999.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue999.py b/spacy/tests/regression/test_issue999.py index 0886acbaf..d0c861cb1 100644 --- a/spacy/tests/regression/test_issue999.py +++ b/spacy/tests/regression/test_issue999.py @@ -1,5 +1,4 @@ from __future__ import unicode_literals -import json import os import random import contextlib @@ -12,7 +11,7 @@ from pathlib import Path import pathlib from ...gold import GoldParse from ...pipeline import EntityRecognizer -from ...en import English +from ...language import Language try: unicode @@ -51,8 +50,8 @@ def test_issue999(train_data): 2) There's no way to set the learning rate for the weight update, so we end up out-of-scale, causing it to learn too fast. ''' - nlp = English(entity=False) - nlp.entity = EntityRecognizer(nlp.vocab, features=English.Defaults.entity_features) + nlp = Language(path=None, entity=False, tagger=False, parser=False) + nlp.entity = EntityRecognizer(nlp.vocab, features=Language.Defaults.entity_features) for _, offsets in train_data: for start, end, ent_type in offsets: nlp.entity.add_label(ent_type) @@ -65,7 +64,7 @@ def test_issue999(train_data): loss = nlp.entity.update(doc, gold) with temp_save_model(nlp) as model_dir: - nlp2 = English(path=model_dir) + nlp2 = Language(path=model_dir) for raw_text, entity_offsets in train_data: doc = nlp2(raw_text) From 4d98511db7aa87622dfb285058ee71aad5b71f18 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Apr 2017 19:01:05 +0200 Subject: [PATCH 065/111] Make Span hashable. 
Closes #1019 --- spacy/tests/spans/test_span.py | 12 ++++++++++++ spacy/tokens/span.pyx | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 14c176edc..d22fa52ae 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -77,3 +77,15 @@ def test_spans_override_sentiment(en_tokenizer): assert doc[:2].sentiment == 10.0 assert doc[-2:].sentiment == 10.0 assert doc[:-1].sentiment == 10.0 + + +def test_spans_are_hashable(en_tokenizer): + """Test spans can be hashed.""" + text = "good stuff bad stuff" + tokens = en_tokenizer(text) + span1 = tokens[:2] + span2 = tokens[2:4] + assert hash(span1) != hash(span2) + span3 = tokens[0:2] + assert hash(span3) == hash(span1) + diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 7e2f62171..fb1e5c732 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -66,6 +66,10 @@ cdef class Span: elif op == 5: return self.start_char >= other.start_char + def __hash__(self): + return hash((self.doc, self.label, self.start_char, self.end_char)) + + def __len__(self): self._recalculate_indices() if self.end < self.start: From 8de59ce3b96a1a5d94e8bf72963e6d632b96564f Mon Sep 17 00:00:00 2001 From: luvogels Date: Wed, 26 Apr 2017 19:10:18 +0200 Subject: [PATCH 066/111] Added tokenizer tests --- spacy/__init__.py | 40 +------------------------------- spacy/tests/nb/__init__.py | 0 spacy/tests/nb/test_tokenizer.py | 17 ++++++++++++++ 3 files changed, 18 insertions(+), 39 deletions(-) create mode 100644 spacy/tests/nb/__init__.py create mode 100644 spacy/tests/nb/test_tokenizer.py diff --git a/spacy/__init__.py b/spacy/__init__.py index 19cc61c06..9bbbd8f3a 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals from . import util from .deprecated import resolve_model_name from .cli.info import info -<<<<<<< HEAD from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb @@ -18,39 +17,6 @@ _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, for _lang in _languages: util.set_lang_class(_lang.lang, _lang) -from . import en -from . import de -from . import zh -from . import es -from . import it -from . import hu -from . import fr -from . import pt -from . import nl -from . import sv -from . import fi -from . import bn -from . import he -from . 
import nb - -from .about import * - - -set_lang_class(en.English.lang, en.English) -set_lang_class(de.German.lang, de.German) -set_lang_class(es.Spanish.lang, es.Spanish) -set_lang_class(pt.Portuguese.lang, pt.Portuguese) -set_lang_class(fr.French.lang, fr.French) -set_lang_class(it.Italian.lang, it.Italian) -set_lang_class(hu.Hungarian.lang, hu.Hungarian) -set_lang_class(zh.Chinese.lang, zh.Chinese) -set_lang_class(nl.Dutch.lang, nl.Dutch) -set_lang_class(sv.Swedish.lang, sv.Swedish) -set_lang_class(fi.Finnish.lang, fi.Finnish) -set_lang_class(bn.Bengali.lang, bn.Bengali) -set_lang_class(he.Hebrew.lang, he.Hebrew) -set_lang_class(nb.Norwegian.lang, nb.Norwegian) - def load(name, **overrides): if overrides.get('path') in (None, False, True): @@ -72,8 +38,4 @@ def load(name, **overrides): cls = util.get_lang_class(lang) overrides['meta'] = meta overrides['path'] = model_path - return cls(**overrides) - - -def info(name, markdown): - info(name, markdown) + return cls(**overrides) \ No newline at end of file diff --git a/spacy/tests/nb/__init__.py b/spacy/tests/nb/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/nb/test_tokenizer.py b/spacy/tests/nb/test_tokenizer.py new file mode 100644 index 000000000..b55901339 --- /dev/null +++ b/spacy/tests/nb/test_tokenizer.py @@ -0,0 +1,17 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + + +NB_TOKEN_EXCEPTION_TESTS = [ + ('Smørsausen brukes bl.a. til fisk', ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk']), + ('Jeg kommer først kl. 13 pga. diverse forsinkelser', ['Jeg', 'kommer', 'først', 'kl.', '13', 'pga.', 'diverse', 'forsinkelser']) +] + + +@pytest.mark.parametrize('text,expected_tokens', NB_TOKEN_EXCEPTION_TESTS) +def test_tokenizer_handles_exception_cases(nb_tokenizer, text, expected_tokens): + tokens = nb_tokenizer(text) + token_list = [token.text for token in tokens if not token.is_space] + assert expected_tokens == token_list From f0e1606d27e8083a00125269946655f50399b105 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Apr 2017 20:25:41 +0200 Subject: [PATCH 067/111] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 6498f80ee..ad4a021c2 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.8.1' +__version__ = '1.8.2' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 4eacd72bc365763810a430cbafc1ef5ec4332dcc Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 20:49:48 +0200 Subject: [PATCH 068/111] Move list of models to own file --- website/docs/usage/_models-list.jade | 27 +++++++++++++++++++++++++++ website/docs/usage/models.jade | 23 +---------------------- 2 files changed, 28 insertions(+), 22 deletions(-) create mode 100644 website/docs/usage/_models-list.jade diff --git a/website/docs/usage/_models-list.jade b/website/docs/usage/_models-list.jade new file mode 100644 index 000000000..866dc1c42 --- /dev/null +++ b/website/docs/usage/_models-list.jade @@ -0,0 +1,27 @@ +//- 💫 DOCS > USAGE > MODELS LIST + +include ../../_includes/_mixins + +p + | Model differences are mostly statistical. In general, we do expect larger + | models to be "better" and more accurate overall. 
Ultimately, it depends on + | your use case and requirements, and we recommend starting with the default + | models (marked with a star below). + ++aside + | Models are now available as #[code .tar.gz] archives #[+a(gh("spacy-models")) from GitHub], + | attached to individual releases. They can be downloaded and loaded manually, + | or using spaCy's #[code download] and #[code link] commands. All models + | follow the naming convention of #[code [language]_[type]_[genre]_[size]]. + | #[br]#[br] + + +button(gh("spacy-models"), true, "primary").u-text-tag + | View model releases + ++table(["Name", "Language", "Voc", "Dep", "Ent", "Vec", "Size", "License"]) + +model-row("en_core_web_sm", "English", [1, 1, 1, 1], "50 MB", "CC BY-SA", true) + +model-row("en_core_web_md", "English", [1, 1, 1, 1], "1 GB", "CC BY-SA") + +model-row("en_depent_web_md", "English", [1, 1, 1, 0], "328 MB", "CC BY-SA") + +model-row("en_vectors_glove_md", "English", [0, 0, 0, 1], "727 MB", "CC BY-SA") + +model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true) + +model-row("fr_depvec_web_lg", "French", [1, 1, 0, 1], "1.33 GB", "CC BY-NC", true, true) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 69142b351..b25b462d8 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -33,28 +33,7 @@ p +h(2, "available") Available models -p - | Model differences are mostly statistical. In general, we do expect larger - | models to be "better" and more accurate overall. Ultimately, it depends on - | your use case and requirements, and we recommend starting with the default - | models (marked with a star below). - -+aside - | Models are now available as #[code .tar.gz] archives #[+a(gh("spacy-models")) from GitHub], - | attached to individual releases. They can be downloaded and loaded manually, - | or using spaCy's #[code download] and #[code link] commands. All models - | follow the naming convention of #[code [language]_[type]_[genre]_[size]]. - | #[br]#[br] - - +button(gh("spacy-models"), true, "primary").u-text-tag - | View model releases - -+table(["Name", "Language", "Voc", "Dep", "Ent", "Vec", "Size", "License"]) - +model-row("en_core_web_sm", "English", [1, 1, 1, 1], "50 MB", "CC BY-SA", true) - +model-row("en_core_web_md", "English", [1, 1, 1, 1], "1 GB", "CC BY-SA") - +model-row("en_depent_web_md", "English", [1, 1, 1, 0], "328 MB", "CC BY-SA") - +model-row("en_vectors_glove_md", "English", [0, 0, 0, 1], "727 MB", "CC BY-SA") - +model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true) +include _models-list +h(2, "download") Downloading models From 375edf0bb566062f0a86b582ee9f98b6f57f1c5a Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 20:50:02 +0200 Subject: [PATCH 069/111] Add list of models and include French --- website/docs/api/language-models.jade | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index 083a501be..6a5ed9a9c 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -7,6 +7,7 @@ p spaCy currently supports the following languages and capabilities: +aside-code("Download language models", "bash"). 
python -m spacy download en python -m spacy download de + python -m spacy download fr +table([ "Language", "Token", "SBD", "Lemma", "POS", "NER", "Dep", "Vector", "Sentiment"]) +row @@ -19,6 +20,14 @@ p spaCy currently supports the following languages and capabilities: each icon in [ "pro", "pro", "con", "pro", "pro", "pro", "pro", "con" ] +cell.u-text-center #[+procon(icon)] + +row + +cell French #[code de] + each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ] + +cell.u-text-center #[+procon(icon)] + ++h(2, "available") Available models + +include ../usage/_models-list +h(2, "alpha-support") Alpha support @@ -27,7 +36,7 @@ p | the existing language data and extending the tokenization patterns. +table([ "Language", "Source" ]) - each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", fr: "French", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } + each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } +row +cell #{language} #[code=code] +cell From 05bcd61fcff932194e1f7af78c003155f0f58727 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 20:51:38 +0200 Subject: [PATCH 070/111] Update README.rst --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 3979b2e37..1045e919d 100644 --- a/README.rst +++ b/README.rst @@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== ============== =========== +`v1.8.2`_ ``2017-04-26`` French model and small improvements `v1.8.1`_ ``2017-04-23`` Saving, loading and training bug fixes `v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading `v1.7.5`_ ``2017-04-07`` Bug fixes and new CLI commands @@ -352,6 +353,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.8.2: https://github.com/explosion/spaCy/releases/tag/v1.8.2 .. _v1.8.1: https://github.com/explosion/spaCy/releases/tag/v1.8.1 .. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 .. 
_v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5 From 100846bed332d9f1baff0426adf7e8426ae21ef9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 26 Apr 2017 21:40:17 +0200 Subject: [PATCH 071/111] Fix typo in model list --- website/docs/usage/_models-list.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/_models-list.jade b/website/docs/usage/_models-list.jade index 866dc1c42..942de28c4 100644 --- a/website/docs/usage/_models-list.jade +++ b/website/docs/usage/_models-list.jade @@ -22,6 +22,6 @@ p +model-row("en_core_web_sm", "English", [1, 1, 1, 1], "50 MB", "CC BY-SA", true) +model-row("en_core_web_md", "English", [1, 1, 1, 1], "1 GB", "CC BY-SA") +model-row("en_depent_web_md", "English", [1, 1, 1, 0], "328 MB", "CC BY-SA") - +model-row("en_vectors_glove_md", "English", [0, 0, 0, 1], "727 MB", "CC BY-SA") + +model-row("en_vectors_glove_md", "English", [1, 0, 0, 1], "727 MB", "CC BY-SA") +model-row("de_core_news_md", "German", [1, 1, 1, 1], "645 MB", "CC BY-SA", true, true) +model-row("fr_depvec_web_lg", "French", [1, 1, 0, 1], "1.33 GB", "CC BY-NC", true, true) From d12a0b643146e58b6b76dbb3514dc2aa62eb0f41 Mon Sep 17 00:00:00 2001 From: luvogels Date: Wed, 26 Apr 2017 23:21:41 +0200 Subject: [PATCH 072/111] Hooked up tokenizer tests --- spacy/tests/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 37d3180d0..b8ada1d9a 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -13,6 +13,8 @@ from ..hu import Hungarian from ..fi import Finnish from ..bn import Bengali from ..he import Hebrew +from ..nb import Norwegian + from ..tokens import Doc from ..strings import StringStore @@ -26,7 +28,7 @@ import pytest LANGUAGES = [English, German, Spanish, Italian, French, Portuguese, Dutch, - Swedish, Hungarian, Finnish, Bengali] + Swedish, Hungarian, Finnish, Bengali, Norwegian] @pytest.fixture(params=LANGUAGES) @@ -88,6 +90,9 @@ def bn_tokenizer(): def he_tokenizer(): return Hebrew.Defaults.create_tokenizer() +@pytest.fixture +def nb_tokenizer(): + return Norwegian.Defaults.create_tokenizer() @pytest.fixture def stringstore(): From e136c51393e8f87b9bf9a4bbf1ea69da604e85a2 Mon Sep 17 00:00:00 2001 From: Leif Uwe Vogelsang Date: Wed, 26 Apr 2017 23:24:11 +0200 Subject: [PATCH 073/111] =?UTF-8?q?Update=20Alpha=5Fsupport=5FNorwegian=20?= =?UTF-8?q?bokm=C3=A5l.md?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .github/Alpha_support_Norwegian bokmål.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/Alpha_support_Norwegian bokmål.md b/.github/Alpha_support_Norwegian bokmål.md index 0917810ab..a3752db42 100644 --- a/.github/Alpha_support_Norwegian bokmål.md +++ b/.github/Alpha_support_Norwegian bokmål.md @@ -16,5 +16,5 @@ Added alpha support for Norwegian bokmål. - [ ] My change requires a change to spaCy's documentation. - [ ] I have updated the documentation accordingly. -- [ ] I have added tests to cover my changes. -- [ ] All new and existing tests passed. +- [x] I have added tests to cover my changes. +- [x] All new and existing tests passed. 
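The nb_tokenizer fixture wired up in conftest.py above can also be exercised outside the test suite. The following is a minimal standalone sketch that mirrors the fixture and the "bl.a." exception case from spacy/tests/nb/test_tokenizer.py; the example sentence comes straight from that test and everything else uses only what the patches above add:

    # coding: utf8
    from spacy.nb import Norwegian

    # Build a standalone Norwegian tokenizer, exactly as the nb_tokenizer fixture does
    tokenizer = Norwegian.Defaults.create_tokenizer()
    doc = tokenizer(u"Smørsausen brukes bl.a. til fisk")
    print([token.text for token in doc if not token.is_space])
    # expected, per the exception test: ['Smørsausen', 'brukes', 'bl.a.', 'til', 'fisk']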
From 13ce4c96b1bfe2f484423e02afb7a49463b650a0 Mon Sep 17 00:00:00 2001 From: Leif Uwe Vogelsang Date: Thu, 27 Apr 2017 10:42:07 +0200 Subject: [PATCH 074/111] Update luvogels.md --- .github/contributors/luvogels.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/contributors/luvogels.md b/.github/contributors/luvogels.md index c915d48bf..c967c1cd2 100644 --- a/.github/contributors/luvogels.md +++ b/.github/contributors/luvogels.md @@ -98,9 +98,9 @@ mark both statements: | Field | Entry | |------------------------------- | -------------------- | -| Name | Shuvanon Razik | +| Name | Leif Uwe Vogelsang | | Company name (if applicable) | | | Title or role (if applicable) | | -| Date | 3/12/2017 | -| GitHub username | shuvanon | +| Date | 4/27/2017 | +| GitHub username | luvogels | | Website (optional) | | From 03d2b0cc058803b373c7fadd1c95e57ad24bf6dc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:14:26 +0200 Subject: [PATCH 075/111] Add newline --- spacy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 3106b49c5..f71d3addd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -37,4 +37,4 @@ def load(name, **overrides): cls = util.get_lang_class(lang) overrides['meta'] = meta overrides['path'] = model_path - return cls(**overrides) \ No newline at end of file + return cls(**overrides) From ccf13ecc21ffd39ef226044c9218ccbca2c221ec Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:14:42 +0200 Subject: [PATCH 076/111] Add newline --- spacy/nb/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/nb/language_data.py b/spacy/nb/language_data.py index 9383f3a62..248b09fc7 100644 --- a/spacy/nb/language_data.py +++ b/spacy/nb/language_data.py @@ -25,4 +25,4 @@ update_exc(TOKENIZER_EXCEPTIONS, expand_exc(TOKENIZER_EXCEPTIONS, "'", "’")) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) # export -__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"] \ No newline at end of file +__all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS", "MORPH_RULES"] From 4cd9269aef8a4a16331991c013f45e6d3bd1ddd9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:15:04 +0200 Subject: [PATCH 077/111] Add newline --- spacy/nb/morph_rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/nb/morph_rules.py b/spacy/nb/morph_rules.py index 1f9f1b84e..38498513b 100644 --- a/spacy/nb/morph_rules.py +++ b/spacy/nb/morph_rules.py @@ -64,4 +64,4 @@ MORPH_RULES = { "var": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Sing"}, "vært": {LEMMA: "be", "VerbForm": "Fin", "Tense": "Past", "Number": "Plur"} } -} \ No newline at end of file +} From 5942adccc278a710944d14db134f2a40cc8ed841 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:15:19 +0200 Subject: [PATCH 078/111] Add newline --- spacy/nb/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/nb/stop_words.py b/spacy/nb/stop_words.py index 56e0ef3bc..721ba2e47 100644 --- a/spacy/nb/stop_words.py +++ b/spacy/nb/stop_words.py @@ -46,4 +46,4 @@ vant var ved veldig vi videre viktig vil ville viser vår være vært ønsker -""".split()) \ No newline at end of file +""".split()) From c9e592ae6c4d68c3d0bc02539ac7de500688bd74 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:15:41 +0200 Subject: [PATCH 079/111] Add newline --- spacy/nb/tokenizer_exceptions.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/nb/tokenizer_exceptions.py b/spacy/nb/tokenizer_exceptions.py index 44fc76532..ea7658c87 100644 --- a/spacy/nb/tokenizer_exceptions.py +++ b/spacy/nb/tokenizer_exceptions.py @@ -172,4 +172,4 @@ ORTH_ONLY = ["adm.dir.", "vsa.", "årg.", "årh." -] \ No newline at end of file +] From 6930ed719d6b28ed95d8502926be70f7ae8384f3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:17:06 +0200 Subject: [PATCH 080/111] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ce4ca62db..69a562e48 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -29,6 +29,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks) * Kendrick Tan, [@kendricktan](https://github.com/kendricktan) * Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson) +* Leif Uwe Vogelsang, [@luvogels](https://github.com/luvogels) * Liling Tan, [@alvations](https://github.com/alvations) * Magnus Burton, [@magnusburton](https://github.com/magnusburton) * Mark Amery, [@ExplodingCabbage](https://github.com/ExplodingCabbage) From bc88f9865e574d842e45b8d27fe89bca16b8763d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:17:30 +0200 Subject: [PATCH 081/111] Remove file (already covered in PR) --- .github/Alpha_support_Norwegian bokmål.md | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 .github/Alpha_support_Norwegian bokmål.md diff --git a/.github/Alpha_support_Norwegian bokmål.md b/.github/Alpha_support_Norwegian bokmål.md deleted file mode 100644 index a3752db42..000000000 --- a/.github/Alpha_support_Norwegian bokmål.md +++ /dev/null @@ -1,20 +0,0 @@ - - -## Description - - -Added alpha support for Norwegian bokmål. - -## Types of changes - -- [ ] **Bug fix** (non-breaking change fixing an issue) -- [x] **New feature** (non-breaking change adding functionality to spaCy) -- [ ] **Breaking change** (fix or feature causing change to spaCy's existing functionality) -- [ ] **Documentation** (addition to documentation of spaCy) - -## Checklist: - -- [ ] My change requires a change to spaCy's documentation. -- [ ] I have updated the documentation accordingly. -- [x] I have added tests to cover my changes. -- [x] All new and existing tests passed. From 2f918e3004879b430b52afa501936b7a255f37aa Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:18:41 +0200 Subject: [PATCH 082/111] Update README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 1045e919d..baf0654e6 100644 --- a/README.rst +++ b/README.rst @@ -5,8 +5,8 @@ spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. It was designed from day one to be used in real products. spaCy currently supports English and German, as well as tokenization for Chinese, Spanish, Italian, French, -Portuguese, Dutch, Swedish, Finnish, Hungarian, Bengali and Hebrew. It's commercial -open-source software, released under the MIT license. +Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's +commercial open-source software, released under the MIT license. 📊 **Help us improve the library!** `Take the spaCy user survey `_. 
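As a quick illustration of the French support now mentioned in the README, loading the new model follows the same pattern as English and German. This is a sketch only: it assumes the French model from the model table above has been installed and linked as "fr" via "python -m spacy download fr", and the sentence is just a placeholder:

    import spacy

    # assumes: python -m spacy download fr  (installs and links the French model)
    nlp = spacy.load('fr')
    doc = nlp(u"Ceci est une phrase.")
    print([(token.text, token.pos_, token.dep_) for token in doc])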
From 034ec5710b01feed0ac782dc975d0eecb31990c6 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 27 Apr 2017 11:24:18 +0200 Subject: [PATCH 083/111] Fix typo and add Norwegian to alpha languages --- website/docs/api/language-models.jade | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index 6a5ed9a9c..a2ad9b9eb 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -21,7 +21,7 @@ p spaCy currently supports the following languages and capabilities: +cell.u-text-center #[+procon(icon)] +row - +cell French #[code de] + +cell French #[code fr] each icon in [ "pro", "pro", "con", "pro", "con", "pro", "pro", "con" ] +cell.u-text-center #[+procon(icon)] @@ -36,7 +36,7 @@ p | the existing language data and extending the tokenization patterns. +table([ "Language", "Source" ]) - each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } + each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } +row +cell #{language} #[code=code] +cell From 7a894c9ef0e693ef78c3979624fa2fec4238d14c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 11:25:30 +0200 Subject: [PATCH 084/111] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index baf0654e6..9b8438ce8 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ spaCy: Industrial-strength NLP spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. It was designed from day one to be used in real products. spaCy currently supports -English and German, as well as tokenization for Chinese, Spanish, Italian, French, +English, German and French, as well as tokenization for Chinese, Spanish, Italian, Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's commercial open-source software, released under the MIT license. From 5aa49971f9693f6b26379c9effcae278a044226a Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 27 Apr 2017 12:08:45 +0200 Subject: [PATCH 085/111] Add French example to models docs --- website/docs/usage/models.jade | 1 + 1 file changed, 1 insertion(+) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index b25b462d8..9bb75ba9a 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -61,6 +61,7 @@ p # out-of-the-box: download best-matching default model python -m spacy download en python -m spacy download de + python -m spacy download fr # download best-matching version of specific model for your spaCy installation python -m spacy download en_core_web_md From c9f9203d5f8ac535d3f5104d8a9883c441eb886b Mon Sep 17 00:00:00 2001 From: "M. Z. Ferdous (Imran)" <1205081.mzfs@ugrad.cse.buet.ac.bd> Date: Thu, 27 Apr 2017 16:48:54 +0600 Subject: [PATCH 086/111] fix typo, CONLL format tried to google about connlu format. Saw there is conll format, not connlu. 
--- website/docs/usage/adding-languages.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 67ac8d610..03a1eae43 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -544,7 +544,7 @@ p p | You can now train the model using a corpus for your language annotated | with #[+a("http://universaldependencies.org/") Universal Dependencies]. - | If your corpus uses the connlu format, you can use the + | If your corpus uses the CONLL format, you can use the | #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. From 2da16adcc289c6999acbad57c47845037101b383 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Apr 2017 13:18:39 +0200 Subject: [PATCH 087/111] Add dropout optin for parser and NER Dropout can now be specified in the `Parser.update()` method via the `drop` keyword argument, e.g. nlp.entity.update(doc, gold, drop=0.4) This will randomly drop 40% of features, and multiply the value of the others by 1. / 0.4. This may be useful for generalising from small data sets. This commit also patches the examples/training/train_new_entity_type.py example, to use dropout and fix the output (previously it did not output the learned entity). --- examples/training/train_new_entity_type.py | 36 +++++++++++++++++++--- spacy/syntax/parser.pyx | 18 ++++++++++- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 1e9505d44..4eae11c75 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# coding: utf8 """ Example of training an additional entity type @@ -26,11 +27,11 @@ For more details, see the documentation: Developed for: spaCy 1.7.6 Last tested for: spaCy 1.7.6 """ -# coding: utf8 from __future__ import unicode_literals, print_function import random from pathlib import Path +import random import spacy from spacy.gold import GoldParse @@ -43,14 +44,35 @@ def train_ner(nlp, train_data, output_dir): doc = nlp.make_doc(raw_text) for word in doc: _ = nlp.vocab[word.orth] - - for itn in range(20): + random.seed(0) + # You may need to change the learning rate. It's generally difficult to + # guess what rate you should set, especially when you have limited data. + nlp.entity.model.learn_rate = 0.001 + for itn in range(1000): random.shuffle(train_data) + loss = 0. for raw_text, entity_offsets in train_data: gold = GoldParse(doc, entities=entity_offsets) + # By default, the GoldParse class assumes that the entities + # described by offset are complete, and all other words should + # have the tag 'O'. You can tell it to make no assumptions + # about the tag of a word by giving it the tag '-'. + # However, this allows a trivial solution to the current + # learning problem: if words are either 'any tag' or 'ANIMAL', + # the model can learn that all words can be tagged 'ANIMAL'. + #for i in range(len(gold.ner)): + #if not gold.ner[i].endswith('ANIMAL'): + # gold.ner[i] = '-' doc = nlp.make_doc(raw_text) nlp.tagger(doc) - loss = nlp.entity.update(doc, gold) + # As of 1.9, spaCy's parser now lets you supply a dropout probability + # This might help the model generalize better from only a few + # examples. 
+ loss += nlp.entity.update(doc, gold, drop=0.9) + if loss == 0: + break + # This step averages the model's weights. This may or may not be good for + # your situation --- it's empirical. nlp.end_training() if output_dir: if not output_dir.exists(): @@ -80,13 +102,19 @@ def main(model_name, output_directory=None): ( "they pretend to care about your feelings, those horses", [(48, 54, 'ANIMAL')] + ), + ( + "horses?", + [(0, 6, 'ANIMAL')] ) + ] nlp.entity.add_label('ANIMAL') train_ner(nlp, train_data, output_directory) # Test that the entity is recognized doc = nlp('Do you like horses?') + print("Ents in 'Do you like horses?':") for ent in doc.ents: print(ent.label_, ent.text) if output_directory: diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 57606dc76..b9de1e114 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -11,6 +11,8 @@ import ujson cimport cython cimport cython.parallel +import numpy.random + from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals from libc.stdint cimport uint32_t, uint64_t @@ -303,7 +305,7 @@ cdef class Parser: free(eg.is_valid) return 0 - def update(self, Doc tokens, GoldParse gold, itn=0): + def update(self, Doc tokens, GoldParse gold, itn=0, double drop=0.0): """ Update the statistical model. @@ -325,9 +327,11 @@ cdef class Parser: nr_feat=self.model.nr_feat) cdef weight_t loss = 0 cdef Transition action + cdef double dropout_rate = self.cfg.get('dropout', drop) while not stcls.is_final(): eg.c.nr_feat = self.model.set_featuresC(eg.c.atoms, eg.c.features, stcls.c) + dropout(eg.c.features, eg.c.nr_feat, dropout_rate) self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) self.model.set_scoresC(eg.c.scores, eg.c.features, eg.c.nr_feat) guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class) @@ -378,6 +382,18 @@ cdef class Parser: self.cfg.setdefault('extra_labels', []).append(label) +cdef int dropout(FeatureC* feats, int nr_feat, float prob) except -1: + if prob <= 0 or prob >= 1.: + return 0 + cdef double[::1] py_probs = numpy.random.uniform(0., 1., nr_feat) + cdef double* probs = &py_probs[0] + for i in range(nr_feat): + if probs[i] >= prob: + feats[i].value /= prob + else: + feats[i].value = 0. + + cdef class StepwiseState: cdef readonly StateClass stcls cdef readonly Example eg From fb96f88b59bfafc77749268c2bd34b6dc65e5a5d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 27 Apr 2017 14:36:08 +0200 Subject: [PATCH 088/111] Update info on CoNLL format and include link --- website/docs/usage/adding-languages.jade | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index 03a1eae43..30c4486b0 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -544,7 +544,9 @@ p p | You can now train the model using a corpus for your language annotated | with #[+a("http://universaldependencies.org/") Universal Dependencies]. - | If your corpus uses the CONLL format, you can use the + | If your corpus uses the + | #[+a("http://universaldependencies.org/docs/format.html") CoNLL-U] format, + | i.e. files with the extension #[code .conllu], you can use the | #[+a("/docs/usage/cli#convert") #[code convert] command] to convert it to | spaCy's #[+a("/docs/api/annotation#json-input") JSON format] for training. 
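To make the new drop argument from the parser/NER dropout patch above concrete, here is a stripped-down sketch of a training loop that uses it. It condenses the updated train_new_entity_type.py example; the single training sentence, its character offsets and the drop=0.4 rate are illustrative, and a trained 'en' model is assumed to be installed:

    import random
    import spacy
    from spacy.gold import GoldParse

    nlp = spacy.load('en')
    nlp.entity.add_label('ANIMAL')
    train_data = [("Do you like horses?", [(12, 18, 'ANIMAL')])]

    for itn in range(20):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            nlp.tagger(doc)
            # drop=0.4 zeroes a random 40% of features and rescales the rest,
            # as described in the commit message above
            loss = nlp.entity.update(doc, gold, drop=0.4)
    nlp.end_training()

Higher rates drop more features per update (the patched example uses drop=0.9), which mainly matters when the training set is very small.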
From c8f83aeb873c2d3beff22cbe0f967b6d56b6793e Mon Sep 17 00:00:00 2001 From: Yasuaki Uechi Date: Wed, 3 May 2017 13:56:21 +0900 Subject: [PATCH 089/111] Add basic japanese support --- setup.py | 3 ++- spacy/__init__.py | 4 ++-- spacy/ja/__init__.py | 19 +++++++++++++++++++ spacy/ja/language_data.py | 23 +++++++++++++++++++++++ spacy/ja/stop_words.py | 9 +++++++++ spacy/ja/tag_map.py | 24 ++++++++++++++++++++++++ 6 files changed, 79 insertions(+), 3 deletions(-) create mode 100644 spacy/ja/__init__.py create mode 100644 spacy/ja/language_data.py create mode 100644 spacy/ja/stop_words.py create mode 100644 spacy/ja/tag_map.py diff --git a/setup.py b/setup.py index 1f13747dc..52ce06843 100755 --- a/setup.py +++ b/setup.py @@ -36,7 +36,8 @@ PACKAGES = [ 'spacy.fi', 'spacy.bn', 'spacy.he', - 'spacy.nb', + 'spacy.nb', + 'spacy.ja', 'spacy.en.lemmatizer', 'spacy.cli.converters', 'spacy.language_data', diff --git a/spacy/__init__.py b/spacy/__init__.py index f71d3addd..f5912e13e 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -5,12 +5,12 @@ from . import util from .deprecated import resolve_model_name from .cli.info import info -from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb +from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja _languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, - fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian) + fi.Finnish, bn.Bengali, he.Hebrew, nb.Norwegian, ja.Japanese) for _lang in _languages: diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py new file mode 100644 index 000000000..f9ab7b560 --- /dev/null +++ b/spacy/ja/__init__.py @@ -0,0 +1,19 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language +from ..attrs import LANG +from ..tokens import Doc + +from .language_data import * + + +class Japanese(Language): + lang = 'ja' + + def make_doc(self, text): + from janome.tokenizer import Tokenizer + words = [x.surface for x in Tokenizer().tokenize(text)] + return Doc(self.vocab, words=words, spaces=[False]*len(words)) diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py new file mode 100644 index 000000000..2e8dfbafb --- /dev/null +++ b/spacy/ja/language_data.py @@ -0,0 +1,23 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# import base language data +from .. 
import language_data as base + + +# import util functions +from ..language_data import update_exc, strings_to_exc + + +# import language-specific data from files +from .tag_map import TAG_MAP +from .stop_words import STOP_WORDS + + +TAG_MAP = dict(TAG_MAP) +STOP_WORDS = set(STOP_WORDS) + + +# export +__all__ = ["TAG_MAP", "STOP_WORDS"] \ No newline at end of file diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py new file mode 100644 index 000000000..b2120b30d --- /dev/null +++ b/spacy/ja/stop_words.py @@ -0,0 +1,9 @@ +# encoding: utf8 +from __future__ import unicode_literals + + +# stop words as whitespace-separated list +STOP_WORDS = set(""" +。 +、 +""".split()) \ No newline at end of file diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py new file mode 100644 index 000000000..2196ff397 --- /dev/null +++ b/spacy/ja/tag_map.py @@ -0,0 +1,24 @@ +# encoding: utf8 +from __future__ import unicode_literals + +from ..symbols import * + + +TAG_MAP = { + "ADV": {POS: ADV}, + "NOUN": {POS: NOUN}, + "ADP": {POS: ADP}, + "PRON": {POS: PRON}, + "SCONJ": {POS: SCONJ}, + "PROPN": {POS: PROPN}, + "DET": {POS: DET}, + "SYM": {POS: SYM}, + "INTJ": {POS: INTJ}, + "PUNCT": {POS: PUNCT}, + "NUM": {POS: NUM}, + "AUX": {POS: AUX}, + "X": {POS: X}, + "CONJ": {POS: CONJ}, + "ADJ": {POS: ADJ}, + "VERB": {POS: VERB} +} \ No newline at end of file From 0e7a9b9facdcdc24f5064070971653f8a75e51ad Mon Sep 17 00:00:00 2001 From: Yasuaki Uechi Date: Wed, 3 May 2017 13:56:45 +0900 Subject: [PATCH 090/111] =?UTF-8?q?Add=20Japanese=20to=20'Alpha=20support?= =?UTF-8?q?=E2=80=99=20section?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- website/docs/api/language-models.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index a2ad9b9eb..40105b85c 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -36,7 +36,7 @@ p | the existing language data and extending the tokenization patterns. 
+table([ "Language", "Source" ]) - each language, code in { zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } + each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } +row +cell #{language} #[code=code] +cell From 8676cd013593444324f101af2f3c0b8c680777bc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:07 +0200 Subject: [PATCH 091/111] Add newline --- spacy/ja/language_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/language_data.py b/spacy/ja/language_data.py index 2e8dfbafb..007ed2b4e 100644 --- a/spacy/ja/language_data.py +++ b/spacy/ja/language_data.py @@ -20,4 +20,4 @@ STOP_WORDS = set(STOP_WORDS) # export -__all__ = ["TAG_MAP", "STOP_WORDS"] \ No newline at end of file +__all__ = ["TAG_MAP", "STOP_WORDS"] From d12ca587eababb75601078c4761e6a9d78fefecc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:29 +0200 Subject: [PATCH 092/111] Add newline --- spacy/ja/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/stop_words.py b/spacy/ja/stop_words.py index b2120b30d..45bb7a4d8 100644 --- a/spacy/ja/stop_words.py +++ b/spacy/ja/stop_words.py @@ -6,4 +6,4 @@ from __future__ import unicode_literals STOP_WORDS = set(""" 。 、 -""".split()) \ No newline at end of file +""".split()) From 949ad6594b759ebd91da142187cbb6f675117eea Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:38:43 +0200 Subject: [PATCH 093/111] Add newline --- spacy/ja/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/ja/tag_map.py b/spacy/ja/tag_map.py index 2196ff397..f5b6b5040 100644 --- a/spacy/ja/tag_map.py +++ b/spacy/ja/tag_map.py @@ -21,4 +21,4 @@ TAG_MAP = { "CONJ": {POS: CONJ}, "ADJ": {POS: ADJ}, "VERB": {POS: VERB} -} \ No newline at end of file +} From d730eb0c0df2fb6784f7adcce479c4c9588764b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:43:29 +0200 Subject: [PATCH 094/111] Raise custom ImportError if importing janome fails --- spacy/ja/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index f9ab7b560..2915d6330 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -14,6 +14,9 @@ class Japanese(Language): lang = 'ja' def make_doc(self, text): - from janome.tokenizer import Tokenizer + try: + from janome.tokenizer import Tokenizer + except ImportError: + raise ImportError("The Japanese tokenizer requires the Janome library: https://github.com/mocobeta/janome") words = [x.surface for x in Tokenizer().tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From 3ea23a3f4db561f800a21bed9b25ced648b826d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 09:44:38 +0200 Subject: [PATCH 095/111] Fix formatting --- spacy/ja/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/ja/__init__.py b/spacy/ja/__init__.py index 2915d6330..07e40ada6 100644 --- a/spacy/ja/__init__.py +++ b/spacy/ja/__init__.py @@ -17,6 +17,7 @@ class Japanese(Language): try: from janome.tokenizer import Tokenizer except ImportError: - raise ImportError("The Japanese tokenizer requires the Janome library: 
https://github.com/mocobeta/janome") + raise ImportError("The Japanese tokenizer requires the Janome library: " + "https://github.com/mocobeta/janome") words = [x.surface for x in Tokenizer().tokenize(text)] return Doc(self.vocab, words=words, spaces=[False]*len(words)) From f9384b0fbd5a555d688b353f2847d4ca32242a76 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 09:58:31 +0200 Subject: [PATCH 096/111] Update alpha languages and add aside for tokenizer dependencies --- website/docs/api/language-models.jade | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/website/docs/api/language-models.jade b/website/docs/api/language-models.jade index 40105b85c..3bce7272f 100644 --- a/website/docs/api/language-models.jade +++ b/website/docs/api/language-models.jade @@ -35,14 +35,15 @@ p | Work has started on the following languages. You can help by improving | the existing language data and extending the tokenization patterns. ++aside("Dependencies") + | Some language tokenizers require external dependencies. To use #[strong Chinese], + | you need to have #[+a("https://github.com/fxsjy/jieba") Jieba] installed. + | The #[strong Japanese] tokenizer requires + | #[+a("https://github.com/mocobeta/janome") Janome]. + +table([ "Language", "Source" ]) - each language, code in { ja: "Japanese", zh: "Chinese", es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew" } + each language, code in { es: "Spanish", it: "Italian", pt: "Portuguese", nl: "Dutch", sv: "Swedish", fi: "Finnish", nb: "Norwegian Bokmål", hu: "Hungarian", bn: "Bengali", he: "Hebrew", zh: "Chinese", ja: "Japanese" } +row +cell #{language} #[code=code] +cell +src(gh("spaCy", "spacy/" + code)) spacy/#{code} - -p - | Chinese tokenization requires the - | #[+a("https://github.com/fxsjy/jieba") Jieba] library. Statistical - | models are coming soon. From e2380d87891a2591790f5873ad44a028a06f8540 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 10:00:04 +0200 Subject: [PATCH 097/111] Update README.rst --- README.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9b8438ce8..24b0c232a 100644 --- a/README.rst +++ b/README.rst @@ -4,9 +4,10 @@ spaCy: Industrial-strength NLP spaCy is a library for advanced natural language processing in Python and Cython. spaCy is built on the very latest research, but it isn't researchware. It was designed from day one to be used in real products. spaCy currently supports -English, German and French, as well as tokenization for Chinese, Spanish, Italian, -Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali and Hebrew. It's -commercial open-source software, released under the MIT license. +English, German and French, as well as tokenization for Spanish, Italian, +Portuguese, Dutch, Swedish, Finnish, Norwegian, Hungarian, Bengali, Hebrew, +Chinese and Japanese. It's commercial open-source software, released under the +MIT license. 📊 **Help us improve the library!** `Take the spaCy user survey `_. 
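The Japanese entry point added above delegates tokenization to Janome inside make_doc. The same logic can be tried standalone, which is also a quick way to check that the optional dependency is available. This sketch simply restates what Japanese.make_doc does; the sample sentence is arbitrary and a bare Vocab is assumed to be enough for illustration:

    # coding: utf8
    # requires: pip install janome
    from janome.tokenizer import Tokenizer
    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    words = [token.surface for token in Tokenizer().tokenize(u'日本語の文章です。')]
    doc = Doc(Vocab(), words=words, spaces=[False] * len(words))
    print([w.text for w in doc])

If Janome is not installed, the full pipeline fails earlier with the descriptive ImportError added in the follow-up commits above.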
From 6e1fad92a1c26ddf1f73a31b7b09f2e7f7cac093 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 3 May 2017 10:01:40 +0200 Subject: [PATCH 098/111] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 69a562e48..b64dc8db3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -52,4 +52,5 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Willem van Hage, [@wrvhage](https://github.com/wrvhage) * Wolfgang Seeker, [@wbwseeker](https://github.com/wbwseeker) * Yanhao Yang, [@YanhaoYang](https://github.com/YanhaoYang) +* Yasuaki Uechi, [@uetchy](https://github.com/uetchy) * Yubing Dong, [@tomtung](https://github.com/tomtung) From c158cdb1dade90ff92892bf86521e6d164916b25 Mon Sep 17 00:00:00 2001 From: akYoung Date: Wed, 3 May 2017 22:41:23 +0800 Subject: [PATCH 099/111] Corrections for model test example The sentences of the test data in the sentence entailment example should be generated with integers limited to vocab_size. --- .../keras_parikh_entailment/keras_decomposable_attention.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/keras_parikh_entailment/keras_decomposable_attention.py b/examples/keras_parikh_entailment/keras_decomposable_attention.py index c8aaffd25..f9f912501 100644 --- a/examples/keras_parikh_entailment/keras_decomposable_attention.py +++ b/examples/keras_parikh_entailment/keras_decomposable_attention.py @@ -256,9 +256,9 @@ def test_fit_model(): settings = {'lr': 0.001, 'dropout': 0.2, 'gru_encode':True} model = build_model(vectors, shape, settings) - train_X = _generate_X(20, shape[0], vectors.shape[1]) + train_X = _generate_X(20, shape[0], vectors.shape[0]) train_Y = _generate_Y(20, shape[2]) - dev_X = _generate_X(15, shape[0], vectors.shape[1]) + dev_X = _generate_X(15, shape[0], vectors.shape[0]) dev_Y = _generate_Y(15, shape[2]) model.fit(train_X, train_Y, validation_data=(dev_X, dev_Y), nb_epoch=5, From a04b5be1b2b4cc01c1c962294077a842ab1d0e59 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 17:01:53 +0200 Subject: [PATCH 100/111] Add glossary for annotation scheme (closes #1034) Can be imported as explain from spacy.glossary, or called as spacy.explain(term) --- spacy/__init__.py | 1 + spacy/glossary.py | 294 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 spacy/glossary.py diff --git a/spacy/__init__.py b/spacy/__init__.py index f5912e13e..2308ce7e4 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from . import util from .deprecated import resolve_model_name from .cli.info import info +from .glossary import explain from . 
import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he, nb, ja diff --git a/spacy/glossary.py b/spacy/glossary.py new file mode 100644 index 000000000..9df26a6af --- /dev/null +++ b/spacy/glossary.py @@ -0,0 +1,294 @@ +# coding: utf8 +from __future__ import unicode_literals + + +def explain(term): + if term in GLOSSARY: + return GLOSSARY[term] + + +GLOSSARY = { + # POS tags + # Universal POS Tags + # http://universaldependencies.org/u/pos/ + + 'ADJ': 'adjective', + 'ADP': 'adposition', + 'ADV': 'adverb', + 'AUX': 'auxiliary', + 'CONJ': 'conjunction', + 'CCONJ': 'coordinating conjunction', + 'DET': 'determiner', + 'INTJ': 'interjection', + 'NOUN': 'noun', + 'NUM': 'numeral', + 'PART': 'particle', + 'PRON': 'pronoun', + 'PROPN': 'proper noun', + 'PUNCT': 'punctuation', + 'SCONJ': 'subordinating conjunction', + 'SYM': 'symbol', + 'VERB': 'verb', + 'X': 'other', + 'EOL': 'end of line', + 'SPACE': 'space', + + + # POS tags (English) + # OntoNotes 5 / Penn Treebank + # https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html + + '.': 'punctuation mark, sentence closer', + ',': 'punctuation mark, comma', + '-LRB-': 'left round bracket', + '-RRB-': 'right round bracket', + '``': 'opening quotation mark', + '""': 'closing quotation mark', + "''": 'closing quotation mark', + ':': 'punctuation mark, colon or ellipsis', + '$': 'symbol, currency', + '#': 'symbol, number sign', + 'AFX': 'affix', + 'CC': 'conjunction, coordinating', + 'CD': 'cardinal number', + 'DT': 'determiner', + 'EX': 'existential there', + 'FW': 'foreign word', + 'HYPH': 'punctuation mark, hyphen', + 'IN': 'conjunction, subordinating or preposition', + 'JJ': 'adjective', + 'JJR': 'adjective, comparative', + 'JJS': 'adjective, superlative', + 'LS': 'list item marker', + 'MD': 'verb, modal auxillary', + 'NIL': 'missing tag', + 'NN': 'noun, singular or mass', + 'NNP': 'noun, proper singular', + 'NNPS': 'noun, proper plural', + 'NNS': 'noun, plural', + 'PDT': 'predeterminer', + 'POS': 'possessive ending', + 'PRP': 'pronoun, personal', + 'PRP$': 'pronoun, possessive', + 'RB': 'adverb', + 'RBR': 'adverb, comparative', + 'RBS': 'adverb, superlative', + 'RP': 'adverb, particle', + 'TO': 'infinitival to', + 'UH': 'interjection', + 'VB': 'verb, base form', + 'VBD': 'verb, past tense', + 'VBG': 'verb, gerund or present participle', + 'VBN': 'verb, past participle', + 'VBP': 'verb, non-3rd person singular present', + 'VBZ': 'verb, 3rd person singular present', + 'WDT': 'wh-determiner', + 'WP': 'wh-pronoun, personal', + 'WP$': 'wh-pronoun, possessive', + 'WRB': 'wh-adverb', + 'SP': 'space', + 'ADD': 'email', + 'NFP': 'superfluous punctuation', + 'GW': 'additional word in multi-word expression', + 'XX': 'unknown', + 'BES': 'auxillary "be"', + 'HVS': 'forms of "have"', + + + # POS Tags (German) + # TIGER Treebank + # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf + + '$(': 'other sentence-internal punctuation mark', + '$,': 'comma', + '$.': 'sentence-final punctuation mark', + 'ADJA': 'adjective, attributive', + 'ADJD': 'adjective, adverbial or predicative', + 'APPO': 'postposition', + 'APRP': 'preposition; circumposition left', + 'APPRART': 'preposition with article', + 'APZR': 'circumposition right', + 'ART': 'definite or indefinite article', + 'CARD': 'cardinal number', + 'FM': 'foreign language material', + 'ITJ': 'interjection', + 'KOKOM': 'comparative conjunction ', + 'KON': 'coordinate conjunction', + 'KOUI': 'subordinate conjunction with "zu" and infinitive', 
+ 'KOUS': 'subordinate conjunction with sentence', + 'NE': 'proper noun', + 'NNE': 'proper noun', + 'PAV': 'pronominal adverb', + 'PROAV': 'pronominal adverb', + 'PDAT': 'attributive demonstrative pronoun', + 'PDS': 'substituting demonstrative pronoun', + 'PIAT': 'attributive indefinite pronoun without determiner', + 'PIDAT': 'attributive indefinite pronoun with determiner', + 'PIS': 'substituting indefinite pronoun', + 'PPER': 'non-reflexive personal pronoun', + 'PPOSAT': 'attributive possessive pronoun', + 'PPOSS': 'substituting possessive pronoun', + 'PRELAT': 'attributive relative pronoun', + 'PRELS': 'substituting relative pronoun', + 'PRF': 'reflexive personal pronoun', + 'PTKA': 'particle with adjective or adverb', + 'PTKANT': 'answer particle', + 'PTKNEG': 'negative particle', + 'PTKVZ': 'separable verbal particle', + 'PTKZU': '"zu" before infinitive', + 'PWAT': 'attributive interrogative pronoun', + 'PWAV': 'adverbial interrogative or relative pronoun', + 'PWS': 'substituting interrogative pronoun', + 'TRUNC': 'word remnant', + 'VAFIN': 'finite verb, auxiliary', + 'VAIMP': 'imperative, auxiliary', + 'VAINF': 'infinitive, auxiliary', + 'VAPP': 'perfect participle, auxiliary', + 'VMFIN': 'finite verb, modal', + 'VMINF': 'infinitive, modal ', + 'VMPP': 'perfect participle, modal ', + 'VVFIN': 'finite verb, full', + 'VVIMP': 'imperative, full', + 'VVINF': 'infinitive, full', + 'VVIZU': 'infinitive with "zu", full', + 'VVPP': 'perfect participle, full', + 'XY': 'non-word containing non-letter', + + + # Noun chunks + + 'NP': 'noun phrase', + 'PP': 'prepositional phrase', + 'VP': 'verb phrase', + 'ADVP': 'adverb phrase', + 'ADJP': 'adjective phrase', + 'SBAR': 'subordinating conjunction', + 'PRT': 'particle', + 'PNP': 'prepositional noun phrase', + + + # Dependency Labels (English) + # ClearNLP / Universal Dependencies + # https://github.com/clir/clearnlp-guidelines/blob/master/md/specifications/dependency_labels.md + + 'acomp': 'adjectival complement', + 'advcl': 'adverbial clause modifier', + 'advmod': 'adverbial modifier', + 'agent': 'agent', + 'amod': 'adjectival modifier', + 'appos': 'appositional modifier', + 'attr': 'attribute', + 'aux': 'auxiliary', + 'auxpass': 'auxiliary (passive)', + 'cc': 'coordinating conjunction', + 'ccomp': 'clausal complement', + 'complm': 'complementizer', + 'conj': 'conjunct', + 'cop': 'copula', + 'csubj': 'clausal subject', + 'csubjpass': 'clausal subject (passive)', + 'dep': 'unclassified dependent', + 'det': 'determiner', + 'dobj': 'direct object', + 'expl': 'expletive', + 'hmod': 'modifier in hyphenation', + 'hyph': 'hyphen', + 'infmod': 'infinitival modifier', + 'intj': 'interjection', + 'iobj': 'indirect object', + 'mark': 'marker', + 'meta': 'meta modifier', + 'neg': 'negation modifier', + 'nmod': 'modifier of nominal', + 'nn': 'noun compound modifier', + 'npadvmod': 'noun phrase as adverbial modifier', + 'nsubj': 'nominal subject', + 'nsubjpass': 'nominal subject (passive)', + 'num': 'number modifier', + 'number': 'number compound modifier', + 'oprd': 'object predicate', + 'obj': 'object', + 'obl': 'oblique nominal', + 'parataxis': 'parataxis', + 'partmod': 'participal modifier', + 'pcomp': 'complement of preposition', + 'pobj': 'object of preposition', + 'poss': 'possession modifier', + 'possessive': 'possessive modifier', + 'preconj': 'pre-correlative conjunction', + 'prep': 'prepositional modifier', + 'prt': 'particle', + 'punct': 'punctuation', + 'quantmod': 'modifier of quantifier', + 'rcmod': 'relative clause modifier', + 'root': 'root', + 
'xcomp': 'open clausal complement', + + + # Dependency labels (German) + # TIGER Treebank + # http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/tiger_introduction.pdf + # currently missing: 'cc' (comparative complement) because of conflict + # with English labels + + 'ac': 'adpositional case marker', + 'adc': 'adjective component', + 'ag': 'genitive attribute', + 'ams': 'measure argument of adjective', + 'app': 'apposition', + 'avc': 'adverbial phrase component', + 'cd': 'coordinating conjunction', + 'cj': 'conjunct', + 'cm': 'comparative conjunction', + 'cp': 'complementizer', + 'cvc': 'collocational verb construction', + 'da': 'dative', + 'dh': 'discourse-level head', + 'dm': 'discourse marker', + 'ep': 'expletive es', + 'hd': 'head', + 'ju': 'junctor', + 'mnr': 'postnominal modifier', + 'mo': 'modifier', + 'ng': 'negation', + 'nk': 'noun kernel element', + 'nmc': 'numerical component', + 'oa': 'accusative object', + 'oa': 'second accusative object', + 'oc': 'clausal object', + 'og': 'genitive object', + 'op': 'prepositional object', + 'par': 'parenthetical element', + 'pd': 'predicate', + 'pg': 'phrasal genitive', + 'ph': 'placeholder', + 'pm': 'morphological particle', + 'pnc': 'proper noun component', + 'rc': 'relative clause', + 're': 'repeated element', + 'rs': 'reported speech', + 'sb': 'subject', + + + # Named Entity Recognition + # OntoNotes 5 + # https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf + + 'PERSON': 'People, including fictional', + 'NORP': 'Nationalities or religious or political groups', + 'FACILITY': 'Buildings, airports, highways, bridges, etc.', + 'ORG': 'Companies, agencies, institutions, etc.', + 'GPE': 'Countries, cities, states', + 'LOC': 'Non-GPE locations, mountain ranges, bodies of water', + 'PRODUCT': 'Objects, vehicles, foods, etc. 
(not services)', + 'EVENT': 'Named hurricanes, battles, wars, sports events, etc.', + 'WORK_OF_ART': 'Titles of books, songs, etc.', + 'LANGUAGE': 'Any named language', + 'DATE': 'Absolute or relative dates or periods', + 'TIME': 'Times smaller than a day', + 'PERCENT': 'Percentage, including "%"', + 'MONEY': 'Monetary values, including unit', + 'QUANTITY': 'Measurements, as of weight or distance', + 'ORDINAL': '"first", "second", etc.', + 'CARDINAL': 'Numerals that do not fall under another type' +} From b1f22c5a102aa123a88d8fe617590cfe34e2ad2a Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:39:54 +0200 Subject: [PATCH 101/111] Fix formatting --- spacy/glossary.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/glossary.py b/spacy/glossary.py index 9df26a6af..4df5264a6 100644 --- a/spacy/glossary.py +++ b/spacy/glossary.py @@ -112,7 +112,7 @@ GLOSSARY = { 'CARD': 'cardinal number', 'FM': 'foreign language material', 'ITJ': 'interjection', - 'KOKOM': 'comparative conjunction ', + 'KOKOM': 'comparative conjunction', 'KON': 'coordinate conjunction', 'KOUI': 'subordinate conjunction with "zu" and infinitive', 'KOUS': 'subordinate conjunction with sentence', @@ -145,8 +145,8 @@ GLOSSARY = { 'VAINF': 'infinitive, auxiliary', 'VAPP': 'perfect participle, auxiliary', 'VMFIN': 'finite verb, modal', - 'VMINF': 'infinitive, modal ', - 'VMPP': 'perfect participle, modal ', + 'VMINF': 'infinitive, modal', + 'VMPP': 'perfect participle, modal', 'VVFIN': 'finite verb, full', 'VVIMP': 'imperative, full', 'VVINF': 'infinitive, full', From 41c6085a6cf68eb979a926dad4249f5ae9ac1821 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:40:08 +0200 Subject: [PATCH 102/111] Add pos-row and dep-row mixins to global mixins --- website/_includes/_mixins.jade | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 9766f5c5a..2f89b0ec4 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -285,7 +285,7 @@ mixin card-item(title, details) span.u-text-small.u-color-subtle by #{details.author} -//- Model row for models table +//- Table row for models table mixin model-row(name, lang, procon, size, license, default_model, divider) - var licenses = { "CC BY-SA": "https://creativecommons.org/licenses/by-sa/3.0/", "CC BY-NC": "https://creativecommons.org/licenses/by-nc/3.0/" } @@ -301,3 +301,21 @@ mixin model-row(name, lang, procon, size, license, default_model, divider) +cell if license in licenses +a(licenses[license])=license + + +//- Table rows for annotation specs + +mixin pos-row(tag, pos, morph, desc) + +row + +cell #[code=tag] + +cell #[code=pos] + +cell + each m in morph.split(" ") + if m + | #[code=m] + +cell.u-text-small=desc + +mixin dep-row(label, desc) + +row + +cell #[code=label] + +cell=desc From 06e414b3fc1c4c90255c571e2a51c4c25417eacb Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:40:30 +0200 Subject: [PATCH 103/111] Don't wrap inline code --- website/assets/css/_components/_code.sass | 1 + 1 file changed, 1 insertion(+) diff --git a/website/assets/css/_components/_code.sass b/website/assets/css/_components/_code.sass index fd7c20e33..83462ef72 100644 --- a/website/assets/css/_components/_code.sass +++ b/website/assets/css/_components/_code.sass @@ -34,6 +34,7 @@ margin: 0 border-radius: 1px box-decoration-break: clone + white-space: nowrap .c-aside__content & background: lighten($color-front, 10) From 
7631d08d67ffaa2a4a24a71b57cb79eefd1b5b6b Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:40:47 +0200 Subject: [PATCH 104/111] Adjust saturation of light theme color --- website/assets/css/_variables.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_variables.sass b/website/assets/css/_variables.sass index 1c38d114a..5f9453ea6 100644 --- a/website/assets/css/_variables.sass +++ b/website/assets/css/_variables.sass @@ -34,7 +34,7 @@ $color-dark: lighten($color-front, 20) !default $color-theme: map-get($colors, $theme) $color-theme-dark: darken(map-get($colors, $theme), 5) -$color-theme-light: saturate(lighten(map-get($colors, $theme), 35), 15) +$color-theme-light: saturate(lighten(map-get($colors, $theme), 35), 5) $color-subtle: #ddd !default $color-subtle-light: #f6f6f6 !default From 0de98472b370cbf7d048f8873b8c76b71b754e3d Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:41:13 +0200 Subject: [PATCH 105/111] Increment CSS version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index 0fd4d4a07..672640405 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -55,7 +55,7 @@ } }, - "V_CSS": "1.5", + "V_CSS": "1.6", "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", From 219369bb7d00527e1954f47c59aa4cc62a48408d Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:41:38 +0200 Subject: [PATCH 106/111] Add detailed docs for dependency label annotations --- website/docs/api/_annotation/_dep-labels.jade | 113 ++++++++++++++++++ website/docs/api/annotation.jade | 11 +- 2 files changed, 114 insertions(+), 10 deletions(-) create mode 100644 website/docs/api/_annotation/_dep-labels.jade diff --git a/website/docs/api/_annotation/_dep-labels.jade b/website/docs/api/_annotation/_dep-labels.jade new file mode 100644 index 000000000..9e1e89324 --- /dev/null +++ b/website/docs/api/_annotation/_dep-labels.jade @@ -0,0 +1,113 @@ +//- 💫 DOCS > API > ANNOTATION > DEPENDENCY LABELS + ++infobox("Tip") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a label. For example, + | #[code spacy.explain("prt")] will return "particle". + ++h(3, "dependency-parsing-english") English dependency labels + +p + | The English dependency labels use the #[+a("http://www.clearnlp.com") ClearNLP] + | #[+a("http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf") CLEAR Style]. 
+ ++table(["Label", "Description"]) + +dep-row("acomp", "adjectival complement") + +dep-row("advcl", "adverbial clause modifier") + +dep-row("advmod", "adverbial modifier") + +dep-row("agent", "agent") + +dep-row("amod", "adjectival modifier") + +dep-row("appos", "appositional modifier") + +dep-row("attr", "attribute") + +dep-row("aux", "auxiliary") + +dep-row("auxpass", "auxiliary (passive)") + +dep-row("cc", "coordinating conjunction") + +dep-row("ccomp", "clausal complement") + +dep-row("complm", "complementizer") + +dep-row("conj", "conjunct") + +dep-row("cop", "copula") + +dep-row("csubj", "clausal subject") + +dep-row("csubjpass", "clausal subject (passive)") + +dep-row("dep", "unclassified dependent") + +dep-row("det", "determiner") + +dep-row("dobj", "direct object") + +dep-row("expl", "expletive") + +dep-row("hmod", "modifier in hyphenation") + +dep-row("hyph", "hyphen") + +dep-row("infmod", "infinitival modifier") + +dep-row("intj", "interjection") + +dep-row("iobj", "indirect object") + +dep-row("mark", "marker") + +dep-row("meta", "meta modifier") + +dep-row("neg", "negation modifier") + +dep-row("nmod", "modifier of nominal") + +dep-row("nn", "noun compound modifier") + +dep-row("npadvmod", "noun phrase as adverbial modifier") + +dep-row("nsubj", "nominal subject") + +dep-row("nsubjpass", "nominal subject (passive)") + +dep-row("num", "number modifier") + +dep-row("number", "number compound modifier") + +dep-row("oprd", "object predicate") + +dep-row("obj", "object") + +dep-row("obl", "oblique nominal") + +dep-row("parataxis", "parataxis") + +dep-row("partmod", "participal modifier") + +dep-row("pcomp", "complement of preposition") + +dep-row("pobj", "object of preposition") + +dep-row("poss", "possession modifier") + +dep-row("possessive", "possessive modifier") + +dep-row("preconj", "pre-correlative conjunction") + +dep-row("prep", "prepositional modifier") + +dep-row("prt", "particle") + +dep-row("punct", "punctuation") + +dep-row("quantmod", "modifier of quantifier") + +dep-row("rcmod", "relative clause modifier") + +dep-row("root", "root") + +dep-row("xcomp", "open clausal complement") + ++h(3, "dependency-parsing-german") German dependency labels + +p + | The German dependency labels use the + | #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank] + | annotation scheme. 
+ ++table(["Label", "Description"]) + +dep-row("ac", "adpositional case marker") + +dep-row("adc", "adjective component") + +dep-row("ag", "genitive attribute") + +dep-row("ams", "measure argument of adjective") + +dep-row("app", "apposition") + +dep-row("avc", "adverbial phrase component") + +dep-row("cc", "comparative complement") + +dep-row("cd", "coordinating conjunction") + +dep-row("cj", "conjunct") + +dep-row("cm", "comparative conjunction") + +dep-row("cp", "complementizer") + +dep-row("cvc", "collocational verb construction") + +dep-row("da", "dative") + +dep-row("dh", "discourse-level head") + +dep-row("dm", "discourse marker") + +dep-row("ep", "expletive es") + +dep-row("hd", "head") + +dep-row("ju", "junctor") + +dep-row("mnr", "postnominal modifier") + +dep-row("mo", "modifier") + +dep-row("ng", "negation") + +dep-row("nk", "noun kernel element") + +dep-row("nmc", "numerical component") + +dep-row("oa", "accusative object") + +dep-row("oa", "second accusative object") + +dep-row("oc", "clausal object") + +dep-row("og", "genitive object") + +dep-row("op", "prepositional object") + +dep-row("par", "parenthetical element") + +dep-row("pd", "predicate") + +dep-row("pg", "phrasal genitive") + +dep-row("ph", "placeholder") + +dep-row("pm", "morphological particle") + +dep-row("pnc", "proper noun component") + +dep-row("rc", "relative clause") + +dep-row("re", "repeated element") + +dep-row("rs", "reported speech") + +dep-row("sb", "subject") diff --git a/website/docs/api/annotation.jade b/website/docs/api/annotation.jade index adc6b28f7..8c6b8fb10 100644 --- a/website/docs/api/annotation.jade +++ b/website/docs/api/annotation.jade @@ -65,16 +65,7 @@ p +h(2, "dependency-parsing") Syntactic Dependency Parsing -+table(["Language", "Converter", "Scheme"]) - +row - +cell English - +cell #[+a("http://www.clearnlp.com") ClearNLP] - +cell #[+a("http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf") CLEAR Style] - - +row - +cell German - +cell #[+a("https://github.com/wbwseeker/tiger2dep") TIGER] - +cell #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER] +include _annotation/_dep-labels +h(2, "named-entities") Named Entity Recognition From 1570b83ee5cbc61d96a9943a1ae4145c7168c2e9 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:41:49 +0200 Subject: [PATCH 107/111] Add spacy.explain() note to NER annotation scheme --- website/docs/api/_annotation/_named-entities.jade | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/api/_annotation/_named-entities.jade b/website/docs/api/_annotation/_named-entities.jade index 476659d4a..68b3bd17d 100644 --- a/website/docs/api/_annotation/_named-entities.jade +++ b/website/docs/api/_annotation/_named-entities.jade @@ -1,5 +1,10 @@ //- 💫 DOCS > API > ANNOTATION > NAMED ENTITIES ++infobox("Tip") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of an entity label. For example, + | #[code spacy.explain("LANGUAGE")] will return "any named language". 
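(Editor's note, not part of the patch: the same lookup works for entity types. The sketch below assumes spaCy v1.8.3+ and an installed English model; the text and the entities it yields are only illustrative.)

```python
# Not part of the patch: a rough sketch of looking up entity type descriptions,
# assuming spaCy v1.8.3+ and an installed English model.
from __future__ import print_function, unicode_literals

import spacy

nlp = spacy.load('en')
doc = nlp(u'Google was founded in California in 1998.')
for ent in doc.ents:
    # ent.label_ is one of the entity types listed in the table below.
    print(ent.text, ent.label_, spacy.explain(ent.label_))
```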
+ +table([ "Type", "Description" ]) +row +cell #[code PERSON] From fac3566aace5bce98a1c2e29d314494a28bed2ff Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 19:42:01 +0200 Subject: [PATCH 108/111] Add descriptions to POS tagging scheme --- website/docs/api/_annotation/_pos-tags.jade | 253 ++++++++++---------- 1 file changed, 128 insertions(+), 125 deletions(-) diff --git a/website/docs/api/_annotation/_pos-tags.jade b/website/docs/api/_annotation/_pos-tags.jade index d3f561c3f..d3ceef777 100644 --- a/website/docs/api/_annotation/_pos-tags.jade +++ b/website/docs/api/_annotation/_pos-tags.jade @@ -1,136 +1,139 @@ //- 💫 DOCS > API > ANNOTATION > POS TAGS -mixin pos-row(...row) - +row - each cell in row - +cell - each item in cell.split(" ") - if item - | #[code=item] ++infobox("Tip") + | In spaCy v1.8.3+, you can also use #[code spacy.explain()] to get the + | description for the string representation of a tag. For example, + | #[code spacy.explain("RB")] will return "adverb". + ++h(3, "pos-tagging-english") English part-of-speech tag scheme p - | The part-of-speech tagger uses the + | The English part-of-speech tagger uses the | #[+a("https://catalog.ldc.upenn.edu/LDC2013T19") OntoNotes 5] version of | the Penn Treebank tag set. We also map the tags to the simpler Google | Universal POS tag set. -+h(3, "pos-tagging-english") English part-of-speech tag scheme - -+table(["Tag", "POS", "Morphology"]) - +pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini") - +pos-row("-PRB-", "PUNCT", "PunctType=brck PunctSide=fin") - +pos-row(",", "PUNCT", "PunctType=comm") - +pos-row(":", "PUNCT", "") - +pos-row(".", "PUNCT", "PunctType=peri") - +pos-row("''", "PUNCT", "PunctType=quot PunctSide=fin") - +pos-row("\"\"", "PUNCT", "PunctType=quot PunctSide=fin") - +pos-row("#", "SYM", "SymType=numbersign") - +pos-row("``", "PUNCT", "PunctType=quot PunctSide=ini") - +pos-row("$", "SYM", "SymType=currency") - +pos-row("ADD", "X", "") - +pos-row("AFX", "ADJ", "Hyph=yes") - +pos-row("BES", "VERB", "") - +pos-row("CC", "CONJ", "ConjType=coor") - +pos-row("CD", "NUM", "NumType=card") - +pos-row("DT", "DET", "") - +pos-row("EX", "ADV", "AdvType=ex") - +pos-row("FW", "X", "Foreign=yes") - +pos-row("GW", "X", "") - +pos-row("HVS", "VERB", "") - +pos-row("HYPH", "PUNCT", "PunctType=dash") - +pos-row("IN", "ADP", "") - +pos-row("JJ", "ADJ", "Degree=pos") - +pos-row("JJR", "ADJ", "Degree=comp") - +pos-row("JJS", "ADJ", "Degree=sup") - +pos-row("LS", "PUNCT", "NumType=ord") - +pos-row("MD", "VERB", "VerbType=mod") - +pos-row("NFP", "PUNCT", "") - +pos-row("NIL", "", "") - +pos-row("NN", "NOUN", "Number=sing") - +pos-row("NNP", "PROPN", "NounType=prop Number=sign") - +pos-row("NNPS", "PROPN", "NounType=prop Number=plur") - +pos-row("NNS", "NOUN", "Number=plur") - +pos-row("PDT", "ADJ", "AdjType=pdt PronType=prn") - +pos-row("POS", "PART", "Poss=yes") - +pos-row("PRP", "PRON", "PronType=prs") - +pos-row("PRP$", "ADJ", "PronType=prs Poss=yes") - +pos-row("RB", "ADV", "Degree=pos") - +pos-row("RBR", "ADV", "Degree=comp") - +pos-row("RBS", "ADV", "Degree=sup") - +pos-row("RP", "PART", "") - +pos-row("SP", "SPACE", "") - +pos-row("SYM", "SYM", "") - +pos-row("TO", "PART", "PartType=inf VerbForm=inf") - +pos-row("UH", "INTJ", "") - +pos-row("VB", "VERB", "VerbForm=inf") - +pos-row("VBD", "VERB", "VerbForm=fin Tense=past") - +pos-row("VBG", "VERB", "VerbForm=part Tense=pres Aspect=prog") - +pos-row("VBN", "VERB", "VerbForm=part Tense=past Aspect=perf") - +pos-row("VBP", "VERB", "VerbForm=fin Tense=pres") - 
+pos-row("VBZ", "VERB", "VerbForm=fin Tense=pres Number=sing Person=3") - +pos-row("WDT", "ADJ", "PronType=int|rel") - +pos-row("WP", "NOUN", "PronType=int|rel") - +pos-row("WP$", "ADJ", "Poss=yes PronType=int|rel") - +pos-row("WRB", "ADV", "PronType=int|rel") - +pos-row("XX", "X", "") ++table(["Tag", "POS", "Morphology", "Description"]) + +pos-row("-LRB-", "PUNCT", "PunctType=brck PunctSide=ini", "left round bracket") + +pos-row("-PRB-", "PUNCT", "PunctType=brck PunctSide=fin", "right round bracket") + +pos-row(",", "PUNCT", "PunctType=comm", "punctuation mark, comma") + +pos-row(":", "PUNCT", "", "punctuation mark, colon or ellipsis") + +pos-row(".", "PUNCT", "PunctType=peri", "punctuation mark, sentence closer") + +pos-row("''", "PUNCT", "PunctType=quot PunctSide=fin", "closing quotation mark") + +pos-row("\"\"", "PUNCT", "PunctType=quot PunctSide=fin", "closing quotation mark") + +pos-row("#", "SYM", "SymType=numbersign", "symbol, number sign") + +pos-row("``", "PUNCT", "PunctType=quot PunctSide=ini", "opening quotation mark") + +pos-row("$", "SYM", "SymType=currency", "symbol, currency") + +pos-row("ADD", "X", "", "email") + +pos-row("AFX", "ADJ", "Hyph=yes", "affix") + +pos-row("BES", "VERB", "", 'auxillary "be"') + +pos-row("CC", "CONJ", "ConjType=coor", "conjunction, coordinating") + +pos-row("CD", "NUM", "NumType=card", "cardinal number") + +pos-row("DT", "DET", "determiner") + +pos-row("EX", "ADV", "AdvType=ex", "existential there") + +pos-row("FW", "X", "Foreign=yes", "foreign word") + +pos-row("GW", "X", "", "additional word in multi-word expression") + +pos-row("HVS", "VERB", "", 'forms of "have"') + +pos-row("HYPH", "PUNCT", "PunctType=dash", "punctuation mark, hyphen") + +pos-row("IN", "ADP", "", "conjunction, subordinating or preposition") + +pos-row("JJ", "ADJ", "Degree=pos", "adjective") + +pos-row("JJR", "ADJ", "Degree=comp", "adjective, comparative") + +pos-row("JJS", "ADJ", "Degree=sup", "adjective, superlative") + +pos-row("LS", "PUNCT", "NumType=ord", "list item marker") + +pos-row("MD", "VERB", "VerbType=mod", "verb, modal auxillary") + +pos-row("NFP", "PUNCT", "", "superfluous punctuation") + +pos-row("NIL", "", "", "missing tag") + +pos-row("NN", "NOUN", "Number=sing", "noun, singular or mass") + +pos-row("NNP", "PROPN", "NounType=prop Number=sign", "noun, proper singular") + +pos-row("NNPS", "PROPN", "NounType=prop Number=plur", "noun, proper plural") + +pos-row("NNS", "NOUN", "Number=plur", "noun, plural") + +pos-row("PDT", "ADJ", "AdjType=pdt PronType=prn", "predeterminer") + +pos-row("POS", "PART", "Poss=yes", "possessive ending") + +pos-row("PRP", "PRON", "PronType=prs", "pronoun, personal") + +pos-row("PRP$", "ADJ", "PronType=prs Poss=yes", "pronoun, possessive") + +pos-row("RB", "ADV", "Degree=pos", "adverb") + +pos-row("RBR", "ADV", "Degree=comp", "adverb, comparative") + +pos-row("RBS", "ADV", "Degree=sup", "adverb, superlative") + +pos-row("RP", "PART", "", "adverb, particle") + +pos-row("SP", "SPACE", "", "space") + +pos-row("SYM", "SYM", "", "symbol") + +pos-row("TO", "PART", "PartType=inf VerbForm=inf", "infinitival to") + +pos-row("UH", "INTJ", "", "interjection") + +pos-row("VB", "VERB", "VerbForm=inf", "verb, base form") + +pos-row("VBD", "VERB", "VerbForm=fin Tense=past", "verb, past tense") + +pos-row("VBG", "VERB", "VerbForm=part Tense=pres Aspect=prog", "verb, gerund or present participle") + +pos-row("VBN", "VERB", "VerbForm=part Tense=past Aspect=perf", "verb, past participle") + +pos-row("VBP", "VERB", "VerbForm=fin Tense=pres", "verb, 
non-3rd person singular present") + +pos-row("VBZ", "VERB", "VerbForm=fin Tense=pres Number=sing Person=3", "verb, 3rd person singular present") + +pos-row("WDT", "ADJ", "PronType=int|rel", "wh-determiner") + +pos-row("WP", "NOUN", "PronType=int|rel", "wh-pronoun, personal") + +pos-row("WP$", "ADJ", "Poss=yes PronType=int|rel", "wh-pronoun, possessive") + +pos-row("WRB", "ADV", "PronType=int|rel", "wh-adverb") + +pos-row("XX", "X", "", "unknown") +h(3, "pos-tagging-german") German part-of-speech tag scheme -+table(["Tag", "POS", "Morphology"]) - +pos-row("$(", "PUNCT", "PunctType=brck") - +pos-row("$,", "PUNCT", "PunctType=comm") - +pos-row("$.", "PUNCT", "PunctType=peri") - +pos-row("ADJA", "ADJ", "") - +pos-row("ADJD", "ADJ", "Variant=short") - +pos-row("ADV", "ADV", "") - +pos-row("APPO", "ADP", "AdpType=post") - +pos-row("APPR", "ADP", "AdpType=prep") - +pos-row("APPRART", "ADP", "AdpType=prep PronType=art") - +pos-row("APZR", "ADP", "AdpType=circ") - +pos-row("ART", "DET", "PronType=art") - +pos-row("CARD", "NUM", "NumType=card") - +pos-row("FM", "X", "Foreign=yes") - +pos-row("ITJ", "INTJ", "") - +pos-row("KOKOM", "CONJ", "ConjType=comp") - +pos-row("KON", "CONJ", "") - +pos-row("KOUI", "SCONJ", "") - +pos-row("KOUS", "SCONJ", "") - +pos-row("NE", "PROPN", "") - +pos-row("NNE", "PROPN", "") - +pos-row("NN", "NOUN", "") - +pos-row("PAV", "ADV", "PronType=dem") - +pos-row("PROAV", "ADV", "PronType=dem") - +pos-row("PDAT", "DET", "PronType=dem") - +pos-row("PDS", "PRON", "PronType=dem") - +pos-row("PIAT", "DET", "PronType=ind|neg|tot") - +pos-row("PIDAT", "DET", "AdjType=pdt PronType=ind|neg|tot") - +pos-row("PIS", "PRON", "PronType=ind|neg|tot") - +pos-row("PPER", "PRON", "PronType=prs") - +pos-row("PPOSAT", "DET", "Poss=yes PronType=prs") - +pos-row("PPOSS", "PRON", "PronType=rel") - +pos-row("PRELAT", "DET", "PronType=rel") - +pos-row("PRELS", "PRON", "PronType=rel") - +pos-row("PRF", "PRON", "PronType=prs Reflex=yes") - +pos-row("PTKA", "PART", "") - +pos-row("PTKANT", "PART", "PartType=res") - +pos-row("PTKNEG", "PART", "Negative=yes") - +pos-row("PTKVZ", "PART", "PartType=vbp") - +pos-row("PTKZU", "PART", "PartType=inf") - +pos-row("PWAT", "DET", "PronType=int") - +pos-row("PWAV", "ADV", "PronType=int") - +pos-row("PWS", "PRON", "PronType=int") - +pos-row("TRUNC", "X", "Hyph=yes") - +pos-row("VAFIN", "AUX", "Mood=ind VerbForm=fin") - +pos-row("VAIMP", "AUX", "Mood=imp VerbForm=fin") - +pos-row("VAINF", "AUX", "VerbForm=inf") - +pos-row("VAPP", "AUX", "Aspect=perf VerbForm=fin") - +pos-row("VMFIN", "VERB", "Mood=ind VerbForm=fin VerbType=mod") - +pos-row("VMINF", "VERB", "VerbForm=fin VerbType=mod") - +pos-row("VMPP", "VERB", "Aspect=perf VerbForm=part VerbType=mod") - +pos-row("VVFIN", "VERB", "Mood=ind VerbForm=fin") - +pos-row("VVIMP", "VERB", "Mood=imp VerbForm=fin") - +pos-row("VVINF", "VERB", "VerbForm=inf") - +pos-row("VVIZU", "VERB", "VerbForm=inf") - +pos-row("VVPP", "VERB", "Aspect=perf VerbForm=part") - +pos-row("XY", "X", "") - +pos-row("SP", "SPACE", "") +p + | The German part-of-speech tagger uses the + | #[+a("http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html") TIGER Treebank] + | annotation scheme. We also map the tags to the simpler Google + | Universal POS tag set. 
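(Editor's note, not part of the patch: a sketch of how the fine-grained tags and the mapped Universal POS tags described here can be compared, assuming spaCy v1.8.3+ and that the German model 'de' is installed; the sentence is arbitrary.)

```python
# Not part of the patch: compare the fine-grained tag (token.tag_) with the
# mapped Universal POS tag (token.pos_), assuming spaCy v1.8.3+ and an
# installed German model. spacy.explain() may return None for tags it has
# no description for.
from __future__ import print_function, unicode_literals

import spacy

nlp = spacy.load('de')
doc = nlp(u'Die Katze schläft auf dem Sofa.')
for token in doc:
    print(token.text, token.tag_, token.pos_, spacy.explain(token.tag_))
```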
+ ++table(["Tag", "POS", "Morphology", "Description"]) + +pos-row("$(", "PUNCT", "PunctType=brck", "other sentence-internal punctuation mark") + +pos-row("$,", "PUNCT", "PunctType=comm", "comma") + +pos-row("$.", "PUNCT", "PunctType=peri", "sentence-final punctuation mark") + +pos-row("ADJA", "ADJ", "", "adjective, attributive") + +pos-row("ADJD", "ADJ", "Variant=short", "adjective, adverbial or predicative") + +pos-row("ADV", "ADV", "", "adverb") + +pos-row("APPO", "ADP", "AdpType=post", "postposition") + +pos-row("APPR", "ADP", "AdpType=prep", "preposition; circumposition left") + +pos-row("APPRART", "ADP", "AdpType=prep PronType=art", "preposition with article") + +pos-row("APZR", "ADP", "AdpType=circ", "circumposition right") + +pos-row("ART", "DET", "PronType=art", "definite or indefinite article") + +pos-row("CARD", "NUM", "NumType=card", "cardinal number") + +pos-row("FM", "X", "Foreign=yes", "foreign language material") + +pos-row("ITJ", "INTJ", "", "interjection") + +pos-row("KOKOM", "CONJ", "ConjType=comp", "comparative conjunction") + +pos-row("KON", "CONJ", "", "coordinate conjunction") + +pos-row("KOUI", "SCONJ", "", 'subordinate conjunction with "zu" and infinitive') + +pos-row("KOUS", "SCONJ", "", "subordinate conjunction with sentence") + +pos-row("NE", "PROPN", "", "proper noun") + +pos-row("NNE", "PROPN", "", "proper noun") + +pos-row("NN", "NOUN", "", "noun, singular or mass") + +pos-row("PAV", "ADV", "PronType=dem", "pronominal adverb") + +pos-row("PROAV", "ADV", "PronType=dem", "pronominal adverb") + +pos-row("PDAT", "DET", "PronType=dem", "attributive demonstrative pronoun") + +pos-row("PDS", "PRON", "PronType=dem", "substituting demonstrative pronoun") + +pos-row("PIAT", "DET", "PronType=ind|neg|tot", "attributive indefinite pronoun without determiner") + +pos-row("PIDAT", "DET", "AdjType=pdt PronType=ind|neg|tot", "attributive indefinite pronoun with determiner") + +pos-row("PIS", "PRON", "PronType=ind|neg|tot", "substituting indefinite pronoun") + +pos-row("PPER", "PRON", "PronType=prs", "non-reflexive personal pronoun") + +pos-row("PPOSAT", "DET", "Poss=yes PronType=prs", "attributive possessive pronoun") + +pos-row("PPOSS", "PRON", "PronType=rel", "substituting possessive pronoun") + +pos-row("PRELAT", "DET", "PronType=rel", "attributive relative pronoun") + +pos-row("PRELS", "PRON", "PronType=rel", "substituting relative pronoun") + +pos-row("PRF", "PRON", "PronType=prs Reflex=yes", "reflexive personal pronoun") + +pos-row("PTKA", "PART", "", "particle with adjective or adverb") + +pos-row("PTKANT", "PART", "PartType=res", "answer particle") + +pos-row("PTKNEG", "PART", "Negative=yes", "negative particle") + +pos-row("PTKVZ", "PART", "PartType=vbp", "separable verbal particle") + +pos-row("PTKZU", "PART", "PartType=inf", '"zu" before infinitive') + +pos-row("PWAT", "DET", "PronType=int", "attributive interrogative pronoun") + +pos-row("PWAV", "ADV", "PronType=int", "adverbial interrogative or relative pronoun") + +pos-row("PWS", "PRON", "PronType=int", "substituting interrogative pronoun") + +pos-row("TRUNC", "X", "Hyph=yes", "word remnant") + +pos-row("VAFIN", "AUX", "Mood=ind VerbForm=fin", "finite verb, auxiliary") + +pos-row("VAIMP", "AUX", "Mood=imp VerbForm=fin", "imperative, auxiliary") + +pos-row("VAINF", "AUX", "VerbForm=inf", "infinitive, auxiliary") + +pos-row("VAPP", "AUX", "Aspect=perf VerbForm=fin", "perfect participle, auxiliary") + +pos-row("VMFIN", "VERB", "Mood=ind VerbForm=fin VerbType=mod", "finite verb, modal") + +pos-row("VMINF", "VERB", 
"VerbForm=fin VerbType=mod", "infinitive, modal") + +pos-row("VMPP", "VERB", "Aspect=perf VerbForm=part VerbType=mod", "perfect participle, modal") + +pos-row("VVFIN", "VERB", "Mood=ind VerbForm=fin", "finite verb, full") + +pos-row("VVIMP", "VERB", "Mood=imp VerbForm=fin", "imperative, full") + +pos-row("VVINF", "VERB", "VerbForm=inf", "infinitive, full") + +pos-row("VVIZU", "VERB", "VerbForm=inf", 'infinitive with "zu", full') + +pos-row("VVPP", "VERB", "Aspect=perf VerbForm=part", "perfect participle, full") + +pos-row("XY", "X", "", "non-word containing non-letter") + +pos-row("SP", "SPACE", "", "space") From a793174ae92f0802970cf19821e24a1004af28d0 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 3 May 2017 20:10:59 +0200 Subject: [PATCH 109/111] Use setuptools.find_packages() --- setup.py | 44 ++------------------------------------------ 1 file changed, 2 insertions(+), 42 deletions(-) diff --git a/setup.py b/setup.py index 52ce06843..3203e38a7 100755 --- a/setup.py +++ b/setup.py @@ -8,53 +8,13 @@ import contextlib from distutils.command.build_ext import build_ext from distutils.sysconfig import get_python_inc from distutils import ccompiler, msvccompiler - -try: - from setuptools import Extension, setup -except ImportError: - from distutils.core import Extension, setup +from setuptools import Extension, setup, find_packages PACKAGE_DATA = {'': ['*.pyx', '*.pxd', '*.txt', '*.tokens']} -PACKAGES = [ - 'spacy', - 'spacy.data', - 'spacy.cli', - 'spacy.tokens', - 'spacy.en', - 'spacy.de', - 'spacy.zh', - 'spacy.es', - 'spacy.fr', - 'spacy.it', - 'spacy.hu', - 'spacy.pt', - 'spacy.nl', - 'spacy.sv', - 'spacy.fi', - 'spacy.bn', - 'spacy.he', - 'spacy.nb', - 'spacy.ja', - 'spacy.en.lemmatizer', - 'spacy.cli.converters', - 'spacy.language_data', - 'spacy.serialize', - 'spacy.syntax', - 'spacy.munge', - 'spacy.tests', - 'spacy.tests.matcher', - 'spacy.tests.parser', - 'spacy.tests.serialize', - 'spacy.tests.spans', - 'spacy.tests.stringstore', - 'spacy.tests.tagger', - 'spacy.tests.tokenizer', - 'spacy.tests.doc', - 'spacy.tests.vectors', - 'spacy.tests.vocab'] +PACKAGES = find_packages() MOD_NAMES = [ From 229b8c39742ce908ab63acd328b0a58dfc7083ec Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 7 May 2017 18:36:35 +0200 Subject: [PATCH 110/111] Tidy up --- setup.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 3203e38a7..a112a6e80 100755 --- a/setup.py +++ b/setup.py @@ -67,15 +67,6 @@ LINK_OPTIONS = { # I don't understand this very well yet. See Issue #267 # Fingers crossed! 
-#if os.environ.get('USE_OPENMP') == '1': -# compile_options['msvc'].append('/openmp') -# -# -#if not sys.platform.startswith('darwin'): -# compile_options['other'].append('-fopenmp') -# link_options['other'].append('-fopenmp') -# - USE_OPENMP_DEFAULT = '1' if sys.platform != 'darwin' else None if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1': if sys.platform == 'darwin': @@ -91,6 +82,7 @@ if os.environ.get('USE_OPENMP', USE_OPENMP_DEFAULT) == '1': COMPILE_OPTIONS['other'].append('-fopenmp') LINK_OPTIONS['other'].append('-fopenmp') + # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used class build_ext_options: From 76ebd0fe5ce78dafdaed87ea2d4508af5150ecb1 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 7 May 2017 18:37:36 +0200 Subject: [PATCH 111/111] Update CONTRIBUTING.md --- CONTRIBUTING.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 837b0a469..8a9ab517b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -43,6 +43,7 @@ To distinguish issues that are opened by us, the maintainers, we usually add a | [`models`](https://github.com/explosion/spaCy/labels/models), `language / [name]` | Issues related to the specific [models](https://github.com/explosion/spacy-models), languages and data | | [`linux`](https://github.com/explosion/spaCy/labels/linux), [`osx`](https://github.com/explosion/spaCy/labels/osx), [`windows`](https://github.com/explosion/spaCy/labels/windows) | Issues related to the specific operating systems | | [`pip`](https://github.com/explosion/spaCy/labels/pip), [`conda`](https://github.com/explosion/spaCy/labels/conda) | Issues related to the specific package managers | +| [`wip`](https://github.com/explosion/spaCy/labels/wip) | Work in progress | | [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before | | [`meta`](https://github.com/explosion/spaCy/labels/meta) | Meta topics, e.g. repo organisation and issue management | | [`help wanted`](https://github.com/explosion/spaCy/labels/help%20wanted), [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions | @@ -75,7 +76,7 @@ example_user would create the file `.github/contributors/example_user.md`. ### Fixing bugs -When fixing a bug, first create an [issue](https://github.com/explosion/spaCy/issues) if one does not already exist. The description text can be very short – we don't want to make this too bureaucratic. +When fixing a bug, first create an [issue](https://github.com/explosion/spaCy/issues) if one does not already exist. The description text can be very short – we don't want to make this too bureaucratic. Next, create a test file named `test_issue[ISSUE NUMBER].py` in the [`spacy/tests/regression`](spacy/tests/regression) folder. Test for the bug you're fixing, and make sure the test fails. Next, add and commit your test file referencing the issue number in the commit message. Finally, fix the bug, make sure your test passes and reference the issue in your commit message. @@ -177,7 +178,7 @@ harp server The docs can always use another example or more detail, and they should always be up to date and not misleading. To quickly find the correct file to edit, simply click on the "Suggest edits" button at the bottom of a page. 
-To make it easy to add content components, we use a [collection of custom mixins](_includes/_mixins.jade), like `+table`, `+list` or `+code`. +To make it easy to add content components, we use a [collection of custom mixins](_includes/_mixins.jade), like `+table`, `+list` or `+code`. 📖 **For more info and troubleshooting guides, check out the [website README](website).**
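(Editor's note, not part of any patch above: the CONTRIBUTING.md changes reference the regression-test workflow, i.e. adding a `test_issue[ISSUE NUMBER].py` file under `spacy/tests/regression`. Below is a hypothetical sketch of such a test; the issue number, example text and assertion are placeholders rather than details from a real issue, and it assumes the English model data is installed.)

```python
# spacy/tests/regression/test_issue0000.py (hypothetical file name and issue number)
from __future__ import unicode_literals

from spacy.en import English


def test_issue0000():
    # Reproduce the reported behaviour first, so the test fails until the bug
    # is actually fixed. Constructing English() loads the installed model data.
    nlp = English()
    doc = nlp(u'This is a sentence.')
    # Placeholder assertion: check whatever behaviour the issue describes.
    assert len(doc) == 5
```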