From e2ed2f02582da9d19d39ada8a3ea819b9b6f4546 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 20:51:21 +0200 Subject: [PATCH 01/21] Bump version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index 82e82093e..bb0489e64 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -55,7 +55,7 @@ } }, - "V_CSS": "1.2", + "V_CSS": "1.3", "V_JS": "1.2", "DEFAULT_SYNTAX": "python", "ANALYTICS": "UA-58931649-1", From 7ceaa1614b8eb140c05a4e100165af195604451f Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 20:51:40 +0200 Subject: [PATCH 02/21] Add experimental model init command --- spacy/__main__.py | 17 +++++- spacy/cli/__init__.py | 1 + spacy/cli/model.py | 129 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 spacy/cli/model.py diff --git a/spacy/__main__.py b/spacy/__main__.py index 7ec3f535a..a805c984d 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -9,12 +9,13 @@ from spacy.cli import link as cli_link from spacy.cli import info as cli_info from spacy.cli import package as cli_package from spacy.cli import train as cli_train +from spacy.cli import model as cli_model class CLI(object): """Command-line interface for spaCy""" - commands = ('download', 'link', 'info', 'package', 'train') + commands = ('download', 'link', 'info', 'package', 'train', 'model') @plac.annotations( model=("model to download (shortcut or model name)", "positional", None, str), @@ -95,6 +96,20 @@ class CLI(object): cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger, not no_parser, not no_ner, parser_L1) + @plac.annotations( + lang=("model language", "positional", None, str), + model_dir=("output directory to store model in", "positional", None, str), + freqs_data=("tab-separated frequencies file", "positional", None, str), + clusters_data=("Brown clusters file", "positional", None, str), + vectors_data=("word vectors file", "positional", None, str) + ) + def model(self, lang, model_dir, freqs_data, clusters_data=None, vectors_data=None): + """ + Initialize a new model and its data directory. + """ + + cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data) + def __missing__(self, name): print("\n Command %r does not exist." diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py index a4bc57ea9..b97279dec 100644 --- a/spacy/cli/__init__.py +++ b/spacy/cli/__init__.py @@ -3,3 +3,4 @@ from .info import info from .link import link from .package import package from .train import train, train_config +from .model import model diff --git a/spacy/cli/model.py b/spacy/cli/model.py new file mode 100644 index 000000000..350023d5a --- /dev/null +++ b/spacy/cli/model.py @@ -0,0 +1,129 @@ +# coding: utf8 +from __future__ import unicode_literals + +import gzip +import math +from ast import literal_eval +from pathlib import Path +from preshed.counter import PreshCounter + +from ..vocab import Vocab, write_binary_vectors +from .. 
import util + + +def model(lang, model_dir, freqs_data, clusters_data, vectors_data): + model_path = Path(model_dir) + freqs_path = Path(freqs_data) + clusters_path = Path(clusters_data) if clusters_data else None + vectors_path = Path(vectors_data) if vectors_data else None + + check_dirs(freqs_path, clusters_path, vectors_path) + vocab = util.get_lang_class(lang).Defaults.create_vocab() + probs, oov_prob = read_probs(freqs_path) + clusters = read_clusters(clusters_path) if clusters_path else {} + populate_vocab(vocab, clusters, probs, oov_prob) + create_model(model_path, vectors_path, vocab, oov_prob) + + +def create_model(model_path, vectors_path, vocab, oov_prob): + vocab_path = model_path / 'vocab' + lexemes_path = vocab_path / 'lexemes.bin' + strings_path = vocab_path / 'strings.json' + oov_path = vocab_path / 'oov_prob' + + if not model_path.exists(): + model_path.mkdir() + if not vocab_path.exists(): + vocab_path.mkdir() + vocab.dump(lexemes_path.as_posix()) + with strings_path.open('w') as f: + vocab.strings.dump(f) + with oov_path.open('w') as f: + f.write('%f' % oov_prob) + if vectors_path: + vectors_dest = model_path / 'vec.bin' + write_binary_vectors(vectors_path.as_posix(), vectors_dest.as_posix()) + + +def read_probs(freqs_path, max_length=100, min_doc_freq=5, min_freq=200): + counts = PreshCounter() + total = 0 + freqs_file = check_unzip(freqs_path) + for i, line in enumerate(freqs_file): + freq, doc_freq, key = line.rstrip().split('\t', 2) + freq = int(freq) + counts.inc(i+1, freq) + total += freq + counts.smooth() + log_total = math.log(total) + freqs_file = check_unzip(freqs_path) + probs = {} + for line in freqs_file: + freq, doc_freq, key = line.rstrip().split('\t', 2) + doc_freq = int(doc_freq) + freq = int(freq) + if doc_freq >= min_doc_freq and freq >= min_freq and len(key) < max_length: + word = literal_eval(key) + smooth_count = counts.smoother(int(freq)) + probs[word] = math.log(smooth_count) - log_total + oov_prob = math.log(counts.smoother(0)) - log_total + return probs, oov_prob + + +def read_clusters(clusters_path): + clusters = {} + with clusters_path.open() as f: + for line in f: + try: + cluster, word, freq = line.split() + except ValueError: + continue + # If the clusterer has only seen the word a few times, its + # cluster is unreliable. + if int(freq) >= 3: + clusters[word] = cluster + else: + clusters[word] = '0' + # Expand clusters with re-casing + for word, cluster in list(clusters.items()): + if word.lower() not in clusters: + clusters[word.lower()] = cluster + if word.title() not in clusters: + clusters[word.title()] = cluster + if word.upper() not in clusters: + clusters[word.upper()] = cluster + return clusters + + +def populate_vocab(vocab, clusters, probs, oov_probs): + # Ensure probs has entries for all words seen during clustering. + for word in clusters: + if word not in probs: + probs[word] = oov_prob + for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): + lexeme = vocab[word] + lexeme.prob = prob + lexeme.is_oov = False + # Decode as a little-endian string, so that we can do & 15 to get + # the first 4 bits. 
See _parse_features.pyx + if word in clusters: + lexeme.cluster = int(clusters[word][::-1], 2) + else: + lexeme.cluster = 0 + + +def check_unzip(file_path): + file_path_str = file_path.as_posix() + if file_path_str.endswith('gz'): + return gzip.open(file_path_str) + else: + return file_path.open() + + +def check_dirs(freqs_data, clusters_data, vectors_data): + if not freqs_data.is_file(): + util.sys_exit(freqs_data.as_posix(), title="No frequencies file found") + if clusters_data and not clusters_data.is_file(): + util.sys_exit(clusters_data.as_posix(), title="No Brown clusters file found") + if vectors_data and not vectors_data.is_file(): + util.sys_exit(vectors_data.as_posix(), title="No word vectors file found") From 7198cf1c8aa9d3d63772b263ec0cce1cbca15083 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 26 Mar 2017 20:56:05 +0200 Subject: [PATCH 03/21] Remove unused import --- spacy/cli/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 350023d5a..4cfd9a6f6 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -7,7 +7,7 @@ from ast import literal_eval from pathlib import Path from preshed.counter import PreshCounter -from ..vocab import Vocab, write_binary_vectors +from ..vocab import write_binary_vectors from .. import util From 17a1e7a11961359839ff66785dc3fe0310d3a082 Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 12:21:01 +0100 Subject: [PATCH 04/21] Add Portuguese numbers and ordinals --- spacy/pt/stop_words.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py index 1faa4a270..30ac19c45 100644 --- a/spacy/pt/stop_words.py +++ b/spacy/pt/stop_words.py @@ -66,3 +66,22 @@ vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós zero """.split()) + + +# Number words + +NUM_WORDS = set(""" +zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze +quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta +sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião +""".split()) + +# Ordinal words + +ORDINAL_WORDS = set(""" +primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo +vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo +octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo +quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo +milésimo milionésimo bilionésimo +""".split()) From c1d020b0a68630bbc9950b09ad72ba38ee1c749f Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 12:26:13 +0100 Subject: [PATCH 05/21] Remove "ista" from portuguese stop words --- spacy/pt/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py index 30ac19c45..7c21f760b 100644 --- a/spacy/pt/stop_words.py +++ b/spacy/pt/stop_words.py @@ -27,7 +27,7 @@ geral grande grandes grupo hoje horas há -iniciar inicio ir irá isso ista isto já +iniciar inicio ir irá isso isto já lado ligado local logo longe lugar lá From 465b240bcb0ccfa445571830eaee923bfa2638cb Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 13:00:29 +0100 Subject: [PATCH 06/21] Review Portuguese stop words Mainly to review typos and add missing masculines/feminines --- spacy/pt/stop_words.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py 
index 7c21f760b..a24356881 100644 --- a/spacy/pt/stop_words.py +++ b/spacy/pt/stop_words.py @@ -3,18 +3,19 @@ from __future__ import unicode_literals STOP_WORDS = set(""" -à às acerca adeus agora ainda algo algumas alguns ali além ambos ano -anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui -aquilo area área as assim através atrás até aí +à às acerca adeus agora ainda algo algumas alguns ali além ambas ambos ano +anos antes ao aos apenas apoio apoia apontar após aquela aquelas aquele aqueles +aqui aquilo área as assim através atrás até aí -baixo bastante bem bom breve +baixo bastante bem boa bom breve cada caminho catorze cedo cento certamente certeza cima cinco coisa com como -comprido conhecido conselho contra corrente custa cá +comprido comprida conhecida conhecido conselho contra corrente custa cá -da daquela daquele dar das de debaixo demais dentro depois desde desligado -dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete -dezoito dia diante direita diz dizem dizer do dois dos doze duas dá dão dúvida +da daquela daquele dar das de debaixo demais dentro depois desde desligada +desligado dessa desse desta deste deve devem deverá dez dezanove dezasseis +dezassete dezoito dia diante direita diz dizem dizer do dois dos doze duas dá +dão dúvida é ela elas ele eles em embora enquanto entre então era és essa essas esse esses esta estado estar estará estas estava este estes esteve estive estivemos @@ -35,34 +36,34 @@ maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus mil minha minhas momento muito muitos máximo mês na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome -nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós -número +nos nossa nossas nosso nossos nova novas nove novo novos num numa nunca nuns +não nível nós número números obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras outro outros para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem poder poderá podia ponto pontos por porque porquê posição possivelmente posso -possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam -pôde põe põem +possível pouca pouco povo primeira primeiro próprio próxima próximo puderam pôde +põe põem -qual qualquer quando quanto quarta quarto quatro que quem quer quero questão -quieto quinta quinto quinze quê +qual qualquer quando quanto quarta quarto quatro que quem quer querem quero +questão quieta quieto quinta quinto quinze quê relação sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo -tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar +tal talvez também tanta tanto tarde te tem temos tempo tendes tenho tens tentar tentaram tente tentei ter terceira terceiro teu teus teve tipo tive tivemos tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu tua tuas tudo tão têm último um uma umas uns usa usar -vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo -vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós +vai vais valor veja vem vens ver verdade verdadeira verdadeiro vez vezes viagem +vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós zero """.split()) From 4fde64c4eac68dd741c9abeb6bca4fb25698288a Mon Sep 17 00:00:00 2001 From: Miguel Almeida Date: Fri, 31 Mar 2017 15:52:55 +0100 Subject: 
[PATCH 07/21] Portuguese contractions and some abreviations --- spacy/pt/tokenizer_exceptions.py | 111 +++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 spacy/pt/tokenizer_exceptions.py diff --git a/spacy/pt/tokenizer_exceptions.py b/spacy/pt/tokenizer_exceptions.py new file mode 100644 index 000000000..1e02f6c6e --- /dev/null +++ b/spacy/pt/tokenizer_exceptions.py @@ -0,0 +1,111 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..symbols import * +from ..language_data import PRON_LEMMA + +TOKENIZER_EXCEPTIONS = {} + +# Contractions +CONTRACTIONS = {} + +personal_pronoun = ( + "ele", "ela", "eles", "elas" +) +demonstrative_pronouns = ( + "este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas", + "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo" +) +undefined_pronouns = ( + "outro", "outra", "outros", "outras" +) +adverbs = ( + "aqui", "aí", "ali", "além" +) + +for word in personal_pronoun + demonstrative_pronouns + \ + undefined_pronouns + adverbs: + CONTRACTIONS["d" + word] = [ + {ORTH: "d", NORM: "de"}, + {ORTH: word} + ] + +for word in personal_pronoun + demonstrative_pronouns + \ + undefined_pronouns: + CONTRACTIONS["n" + word] = [ + {ORTH: "n", NORM: "em"}, + {ORTH: word} + ] + +# Not so linear contractions "a"+something + +CONTRACTIONS.update({ + # This one cannot be split into 2 + # "à": [ + # {ORTH: "à", NORM: "a"}, + # {ORTH: "", NORM: "a"} + # ], + "às": [ + {ORTH: "à", NORM: "a"}, + {ORTH: "s", NORM: "as"} + ], + "ao": [ + {ORTH: "a"}, + {ORTH: "o"} + ], + "aos": [ + {ORTH: "a"}, + {ORTH: "os"} + ], + "àquele": [ + {ORTH: "à", NORM: "a"}, + {ORTH: "quele", NORM: "aquele"} + ], + "àquela": [ + {ORTH: "à", NORM: "a"}, + {ORTH: "quela", NORM: "aquela"} + ], + "àqueles": [ + {ORTH: "à", NORM: "a"}, + {ORTH: "queles", NORM: "aqueles"} + ], + "àquelas": [ + {ORTH: "à", NORM: "a"}, + {ORTH: "quelas", NORM: "aquelas"} + ], + "àquilo": [ + {ORTH: "à", NORM: "a"}, + {ORTH: "quilo", NORM: "aquilo"} + ], + "aonde": [ + {ORTH: "a"}, + {ORTH: "onde"} + ], +}) + +TOKENIZER_EXCEPTIONS.update(CONTRACTIONS) + +# Abbreviations with only one ORTH token + +ORTH_ONLY = [ + "Adm.", + "Dr.", + "e.g.", + "E.g.", + "E.G.", + "Gen.", + "Gov.", + "i.e.", + "I.e.", + "I.E.", + "Jr.", + "Ltd.", + "p.m.", + "Ph.D.", + "Rep.", + "Rev.", + "Sen.", + "Sr.", + "Sra.", + "vs.", +] From d4a59c254bd30a83748586a85850e0cd351b20d9 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 1 Apr 2017 10:19:01 +0200 Subject: [PATCH 08/21] Remove whitespace --- spacy/lexeme.pyx | 30 +++++++++++++++--------------- spacy/tokens/span.pyx | 26 +++++++++++++------------- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 1d5421d74..84338e281 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -87,7 +87,7 @@ cdef class Lexeme: value (bool): The new value of the flag. """ Lexeme.c_set_flag(self.c, flag_id, value) - + def check_flag(self, attr_id_t flag_id): """Check the value of a boolean flag. @@ -137,7 +137,7 @@ cdef class Lexeme: "\npython -m spacy.%s.download all\n" "to install the data." 
% self.vocab.lang ) - + vector_view = self.c.vector return numpy.asarray(vector_view) @@ -163,7 +163,7 @@ cdef class Lexeme: return self.c.sentiment def __set__(self, float sentiment): self.c.sentiment = sentiment - + property orth_: def __get__(self): return self.vocab.strings[self.c.orth] @@ -171,7 +171,7 @@ cdef class Lexeme: property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x - + property norm: def __get__(self): return self.c.norm def __set__(self, int x): self.c.norm = x @@ -187,11 +187,11 @@ cdef class Lexeme: property suffix: def __get__(self): return self.c.suffix def __set__(self, int x): self.c.suffix = x - + property cluster: def __get__(self): return self.c.cluster def __set__(self, int x): self.c.cluster = x - + property lang: def __get__(self): return self.c.lang def __set__(self, int x): self.c.lang = x @@ -203,11 +203,11 @@ cdef class Lexeme: property lower_: def __get__(self): return self.vocab.strings[self.c.lower] def __set__(self, unicode x): self.c.lower = self.vocab.strings[x] - + property norm_: def __get__(self): return self.vocab.strings[self.c.norm] def __set__(self, unicode x): self.c.norm = self.vocab.strings[x] - + property shape_: def __get__(self): return self.vocab.strings[self.c.shape] def __set__(self, unicode x): self.c.shape = self.vocab.strings[x] @@ -239,7 +239,7 @@ cdef class Lexeme: property is_alpha: def __get__(self): return Lexeme.c_check_flag(self.c, IS_ALPHA) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ALPHA, x) - + property is_ascii: def __get__(self): return Lexeme.c_check_flag(self.c, IS_ASCII) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_ASCII, x) @@ -260,23 +260,23 @@ cdef class Lexeme: def __get__(self): return Lexeme.c_check_flag(self.c, IS_PUNCT) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_PUNCT, x) - property is_space: + property is_space: def __get__(self): return Lexeme.c_check_flag(self.c, IS_SPACE) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_SPACE, x) - property is_bracket: + property is_bracket: def __get__(self): return Lexeme.c_check_flag(self.c, IS_BRACKET) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_BRACKET, x) - property is_quote: + property is_quote: def __get__(self): return Lexeme.c_check_flag(self.c, IS_QUOTE) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_QUOTE, x) - property is_left_punct: + property is_left_punct: def __get__(self): return Lexeme.c_check_flag(self.c, IS_LEFT_PUNCT) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_LEFT_PUNCT, x) - property is_right_punct: + property is_right_punct: def __get__(self): return Lexeme.c_check_flag(self.c, IS_RIGHT_PUNCT) def __set__(self, bint x): Lexeme.c_set_flag(self.c, IS_RIGHT_PUNCT, x) @@ -284,7 +284,7 @@ cdef class Lexeme: property like_url: def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_URL) def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_URL, x) - + property like_num: def __get__(self): return Lexeme.c_check_flag(self.c, LIKE_NUM) def __set__(self, bint x): Lexeme.c_set_flag(self.c, LIKE_NUM, x) diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index fc5d26174..37d99183c 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -128,13 +128,13 @@ cdef class Span: end = token_by_end(self.doc.c, self.doc.length, self.end_char) if end == -1: raise IndexError("Error calculating span: Can't find end") - + self.start = start self.end = end + 1 property sent: '''The sentence span that this span is a part of. 
- + Returns: Span The sentence this is part of. ''' @@ -157,7 +157,7 @@ cdef class Span: if 'has_vector' in self.doc.user_span_hooks: return self.doc.user_span_hooks['has_vector'](self) return any(token.has_vector for token in self) - + property vector: def __get__(self): if 'vector' in self.doc.user_span_hooks: @@ -200,9 +200,9 @@ cdef class Span: property noun_chunks: ''' Yields base noun-phrase #[code Span] objects, if the document - has been syntactically parsed. A base noun phrase, or - 'NP chunk', is a noun phrase that does not permit other NPs to - be nested within it – so no NP-level coordination, no prepositional + has been syntactically parsed. A base noun phrase, or + 'NP chunk', is a noun phrase that does not permit other NPs to + be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. For example: ''' def __get__(self): @@ -227,19 +227,19 @@ cdef class Span: Returns: Token: The root token. - + i.e. has the shortest path to the root of the sentence (or is the root itself). If multiple words are equally high in the tree, the first word is taken. - + For example: - + >>> toks = nlp(u'I like New York in Autumn.') Let's name the indices --- easier than writing "toks[4]" etc. - >>> i, like, new, york, in_, autumn, dot = range(len(toks)) + >>> i, like, new, york, in_, autumn, dot = range(len(toks)) The head of 'new' is 'York', and the head of 'York' is 'like' @@ -301,10 +301,10 @@ cdef class Span: return self.doc[self.start] else: return self.doc[root] - + property lefts: """Tokens that are to the left of the span, whose head is within the Span. - + Yields: Token A left-child of a token of the span. """ def __get__(self): @@ -315,7 +315,7 @@ cdef class Span: property rights: """Tokens that are to the right of the Span, whose head is within the Span. - + Yields: Token A right-child of a token of the span. """ def __get__(self): From 42382d56926ac545b47dfb5e9abcdbb4b7e469f8 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 1 Apr 2017 10:19:32 +0200 Subject: [PATCH 09/21] Fix download commands in error messages (see #946) --- spacy/lexeme.pyx | 2 +- spacy/tokens/doc.pyx | 4 ++-- spacy/tokens/span.pyx | 2 +- spacy/tokens/token.pyx | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 84338e281..3a26161bb 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -134,7 +134,7 @@ cdef class Lexeme: raise ValueError( "Word vectors set to length 0. This may be because the " "data is not installed. If you haven't already, run" - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data." % self.vocab.lang ) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index bda528383..2e1481d1b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -394,7 +394,7 @@ cdef class Doc: raise ValueError( "noun_chunks requires the dependency parse, which " "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data" % self.vocab.lang) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. @@ -427,7 +427,7 @@ cdef class Doc: raise ValueError( "sentence boundary detection requires the dependency parse, which " "requires data to be installed. 
If you haven't done so, run: " - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data" % self.vocab.lang) cdef int i start = 0 diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 37d99183c..f43d47876 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -210,7 +210,7 @@ cdef class Span: raise ValueError( "noun_chunks requires the dependency parse, which " "requires data to be installed. If you haven't done so, run: " - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data" % self.vocab.lang) # Accumulate the result before beginning to iterate over it. This prevents # the tokenisation from being changed out from under us during the iteration. diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 34de9dee7..b8e470437 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -236,7 +236,7 @@ cdef class Token: raise ValueError( "Word vectors set to length 0. This may be because the " "data is not installed. If you haven't already, run" - "\npython -m spacy.%s.download all\n" + "\npython -m spacy download %s\n" "to install the data." % self.vocab.lang ) vector_view = self.c.lex.vector From ad8bf1829f72e0bf769081c7f9a8008a82df7504 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 1 Apr 2017 10:37:42 +0200 Subject: [PATCH 10/21] Import and combine Portuguese tokenizer exceptions (see #943) --- spacy/pt/language_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py index f9899d8d1..d96cdd38f 100644 --- a/spacy/pt/language_data.py +++ b/spacy/pt/language_data.py @@ -5,13 +5,15 @@ from .. import language_data as base from ..language_data import update_exc, strings_to_exc from .stop_words import STOP_WORDS - +from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY STOP_WORDS = set(STOP_WORDS) -TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS) +TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY)) update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS)) +update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS)) __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"] From 2de2195be887acb25e9a364293842e87ea1ee9ba Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 1 Apr 2017 10:39:42 +0200 Subject: [PATCH 11/21] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 27d280785..53807208c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -33,6 +33,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Matthew Honnibal, [@honnibal](https://github.com/honnibal) * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx) * Michael Wallin, [@wallinm1](https://github.com/wallinm1) +* Miguel Almeida, [@mamoit](https://github.com/mamoit) * Oleg Zd, [@olegzd](https://github.com/olegzd) * Pokey Rule, [@pokey](https://github.com/pokey) * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202) From 2c36a61ec531c570d51e2afbe47bce0f5364cfd6 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 3 Apr 2017 18:12:38 +0200 Subject: [PATCH 12/21] Add spacyr to libraries --- website/docs/usage/_data.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index c219585f9..c8c85af1d 100644 --- a/website/docs/usage/_data.json +++ 
b/website/docs/usage/_data.json @@ -151,6 +151,11 @@ "url": "https://github.com/golastmile/rasa_nlu", "author": "LASTMILE", "description": "High level APIs for building your own language parser using existing NLP and ML libraries." + }, + "spacyr": { + "url": "https://github.com/kbenoit/spacyr", + "author": "Kenneth Benoit", + "description": "An R wrapper for spaCy." } }, "visualizations": { From 808cd6cf7f184e20d9b8e42364f7e10f045028dc Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 3 Apr 2017 18:12:52 +0200 Subject: [PATCH 13/21] Add missing tags to verbs (resolves #948) --- spacy/en/tokenizer_exceptions.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py index 07b01c4fb..3d009241b 100644 --- a/spacy/en/tokenizer_exceptions.py +++ b/spacy/en/tokenizer_exceptions.py @@ -213,15 +213,15 @@ for verb_data in [ {ORTH: "does", LEMMA: "do"}, {ORTH: "did", LEMMA: "do", TAG: "VBD"}, {ORTH: "had", LEMMA: "have", TAG: "VBD"}, - {ORTH: "may"}, - {ORTH: "might"}, - {ORTH: "must"}, + {ORTH: "may", TAG: "MD"}, + {ORTH: "might", TAG: "MD"}, + {ORTH: "must", TAG: "MD"}, {ORTH: "need"}, {ORTH: "ought"}, - {ORTH: "sha", LEMMA: "shall"}, - {ORTH: "should"}, - {ORTH: "wo", LEMMA: "will"}, - {ORTH: "would"} + {ORTH: "sha", LEMMA: "shall", TAG: "MD"}, + {ORTH: "should", TAG: "MD"}, + {ORTH: "wo", LEMMA: "will", TAG: "MD"}, + {ORTH: "would", TAG: "MD"} ]: verb_data_tc = dict(verb_data) verb_data_tc[ORTH] = verb_data_tc[ORTH].title() From 10e8dcdfdb7acd7ecb2c19d3d522c73bca5d7de3 Mon Sep 17 00:00:00 2001 From: Kumaran Rajendhiran Date: Wed, 5 Apr 2017 16:20:47 +0530 Subject: [PATCH 14/21] Remove not needed parameters from function --- examples/keras_parikh_entailment/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py index 5c3132bab..08bccb648 100644 --- a/examples/keras_parikh_entailment/__main__.py +++ b/examples/keras_parikh_entailment/__main__.py @@ -52,7 +52,7 @@ def train(train_loc, dev_loc, shape, settings): file_.write(model.to_json()) -def evaluate(model_dir, dev_loc): +def evaluate(dev_loc): dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc) nlp = spacy.load('en', create_pipeline=create_similarity_pipeline) From 47d7137c837de33207230cb780ab369019f50ff8 Mon Sep 17 00:00:00 2001 From: Kumaran Rajendhiran Date: Wed, 5 Apr 2017 16:48:35 +0530 Subject: [PATCH 15/21] Set max_length to 100 for demo and evaluate --- examples/keras_parikh_entailment/spacy_hook.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/keras_parikh_entailment/spacy_hook.py b/examples/keras_parikh_entailment/spacy_hook.py index 0177da001..c1e36327c 100644 --- a/examples/keras_parikh_entailment/spacy_hook.py +++ b/examples/keras_parikh_entailment/spacy_hook.py @@ -80,10 +80,10 @@ def get_word_ids(docs, rnn_encode=False, tree_truncate=False, max_length=100, nr return Xs -def create_similarity_pipeline(nlp): +def create_similarity_pipeline(nlp, max_length=100): return [ nlp.tagger, nlp.entity, nlp.parser, - KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length=10) + KerasSimilarityShim.load(nlp.path / 'similarity', nlp, max_length) ] From 3f55d6afaea3e02662d42f0993933c688f3b8438 Mon Sep 17 00:00:00 2001 From: Kumaran Rajendhiran Date: Wed, 5 Apr 2017 16:59:52 +0530 Subject: [PATCH 16/21] Update README --- examples/keras_parikh_entailment/README.md | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/examples/keras_parikh_entailment/README.md b/examples/keras_parikh_entailment/README.md index 571ba6e73..adc80ce89 100644 --- a/examples/keras_parikh_entailment/README.md +++ b/examples/keras_parikh_entailment/README.md @@ -78,7 +78,7 @@ You can run the `keras_parikh_entailment/` directory as a script, which executes [`keras_parikh_entailment/__main__.py`](__main__.py). The first thing you'll want to do is train the model: ```bash -python keras_parikh_entailment/ train +python keras_parikh_entailment/ train ``` Training takes about 300 epochs for full accuracy, and I haven't rerun the full From 010293fb2f0ce0841f3eb4ebe7e53e4d0ec0133a Mon Sep 17 00:00:00 2001 From: oeg Date: Thu, 6 Apr 2017 17:33:15 +0200 Subject: [PATCH 17/21] fix(typo): Fixes typo in method calling PseudoProjectivity.deprojectivize, failing with new train cli --- spacy/language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 920a4c4c8..25bfb9e08 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -144,7 +144,7 @@ class BaseDefaults(object): pipeline.append(nlp.tagger) if nlp.parser: pipeline.append(nlp.parser) - pipeline.append(Pseudoprojectivity.deprojectivize) + pipeline.append(PseudoProjectivity.deprojectivize) if nlp.entity: pipeline.append(nlp.entity) return pipeline From 75f9b4c6e288f93513c85617a79fd6b5bd84ac2a Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 7 Apr 2017 10:21:29 +0200 Subject: [PATCH 18/21] Fix whitespace --- spacy/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 6c25ce0e8..2d9812839 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,7 +7,6 @@ import re import os.path import pathlib import sys - import textwrap From d6bbc3ffcdb4e7e3ba88abf41798c534fab449b1 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 7 Apr 2017 10:21:43 +0200 Subject: [PATCH 19/21] Fix formatting --- website/docs/usage/troubleshooting.jade | 1 - 1 file changed, 1 deletion(-) diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade index 06454b055..8af611859 100644 --- a/website/docs/usage/troubleshooting.jade +++ b/website/docs/usage/troubleshooting.jade @@ -33,7 +33,6 @@ p | import the language's #[code Language] class instead, for example | #[code from spacy.fr import French]. - +h(3, "symlink-privilege") Symbolic link privilege not held +code(false, "text"). From f33c4cbae11354133acceb2be655d946b7c3b5f8 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 7 Apr 2017 10:22:06 +0200 Subject: [PATCH 20/21] Add --no-cache-dir error to troubleshooting docs (see #958) --- website/docs/usage/troubleshooting.jade | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade index 8af611859..cb8271343 100644 --- a/website/docs/usage/troubleshooting.jade +++ b/website/docs/usage/troubleshooting.jade @@ -50,6 +50,20 @@ p | or use a #[code virtualenv] to install spaCy in a user directory, instead | of doing a system-wide installation. ++h(3, "no-cache-dir") No such option: --no-cache-dir + ++code(false, "text"). + no such option: --no-cache-dir + +p + | The #[code download] command uses pip to install the models and sets the + | #[code --no-cache-dir] flag to prevent it from requiring too much memory. + | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] + | requires pip v6.0 or newer. 
+ ++infobox("Solution") + | Run #[code pip install -U pip] to upgrade to the latest version of pip. + | To see which version you have installed, run #[code pip --version]. +h(3, "import-error") Import error From 2a60597089a24645140bbc336f18644fca2b1f8f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 7 Apr 2017 13:34:05 +0200 Subject: [PATCH 21/21] Update CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 53807208c..d40dedf55 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -12,6 +12,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a * Christoph Schwienheer, [@chssch](https://github.com/chssch) * Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk) * Daniel Rapp, [@rappdw](https://github.com/rappdw) +* Daniel Vila Suero, [@dvsrepo](https://github.com/dvsrepo) * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi) * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28) * Greg Baker, [@solresol](https://github.com/solresol)