diff --git a/.travis.yml b/.travis.yml
index 55c080d1d..acdf637d1 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,9 +22,9 @@ install:
 
 script:
   - "pip install pytest"
-  - if [[ "${VIA}" == "compile" ]]; then python -m pytest spacy; fi
-  - if [[ "${VIA}" == "pypi" ]]; then python -m pytest `python -c "import os.path; import spacy; print(os.path.abspath(ospath.dirname(spacy.__file__)))"`; fi
-  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
+  - if [[ "${VIA}" == "compile" ]]; then python -m pytest --tb=native spacy; fi
+  - if [[ "${VIA}" == "pypi" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
+  - if [[ "${VIA}" == "sdist" ]]; then python -m pytest --tb=native `python -c "import os.path; import spacy; print(os.path.abspath(os.path.dirname(spacy.__file__)))"`; fi
 
 notifications:
   slack:
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 2f9b75936..53807208c 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -7,6 +7,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Andrew Poliakov, [@pavlin99th](https://github.com/pavlin99th)
 * Aniruddha Adhikary [@aniruddha-adhikary](https://github.com/aniruddha-adhikary)
 * Bhargav Srinivasa, [@bhargavvader](https://github.com/bhargavvader)
+* Bruno P. Kinoshita, [@kinow](https://github.com/kinow)
 * Chris DuBois, [@chrisdubois](https://github.com/chrisdubois)
 * Christoph Schwienheer, [@chssch](https://github.com/chssch)
 * Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk)
@@ -14,6 +15,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
 * Eric Zhao, [@ericzhao28](https://github.com/ericzhao28)
 * Greg Baker, [@solresol](https://github.com/solresol)
+* Grégory Howard, [@Gregory-Howard](https://github.com/Gregory-Howard)
 * György Orosz, [@oroszgy](https://github.com/oroszgy)
 * Henning Peters, [@henningpeters](https://github.com/henningpeters)
 * Iddo Berger, [@iddoberger](https://github.com/iddoberger)
@@ -21,6 +23,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * J Nicolas Schrading, [@NSchrading](https://github.com/NSchrading)
 * Janneke van der Zwaan, [@jvdzwaan](https://github.com/jvdzwaan)
 * Jordan Suchow, [@suchow](https://github.com/suchow)
+* Josh Reeter, [@jreeter](https://github.com/jreeter)
 * Juan Miguel Cejuela, [@juanmirocks](https://github.com/juanmirocks)
 * Kendrick Tan, [@kendricktan](https://github.com/kendricktan)
 * Kyle P. Johnson, [@kylepjohnson](https://github.com/kylepjohnson)
@@ -30,6 +33,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
 * Matthew Honnibal, [@honnibal](https://github.com/honnibal)
 * Maxim Samsonov, [@maxirmx](https://github.com/maxirmx)
 * Michael Wallin, [@wallinm1](https://github.com/wallinm1)
+* Miguel Almeida, [@mamoit](https://github.com/mamoit)
 * Oleg Zd, [@olegzd](https://github.com/olegzd)
 * Pokey Rule, [@pokey](https://github.com/pokey)
 * Raphaël Bournhonesque, [@raphael0202](https://github.com/raphael0202)
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 3777fd85f..82d1d9a33 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -46,7 +46,7 @@ def symlink(model_path, link_name, force):
     # Add workaround for Python 2 on Windows (see issue #909)
     if util.is_python2() and util.is_windows():
         import subprocess
-        command = ['mklink', '/d', link_path.as_posix(), model_path.as_posix()]
+        command = ['mklink', '/d', link_path, model_path]
         subprocess.call(command, shell=True)
     else:
         link_path.symlink_to(model_path)
diff --git a/spacy/en/tokenizer_exceptions.py b/spacy/en/tokenizer_exceptions.py
index 07b01c4fb..3d009241b 100644
--- a/spacy/en/tokenizer_exceptions.py
+++ b/spacy/en/tokenizer_exceptions.py
@@ -213,15 +213,15 @@ for verb_data in [
     {ORTH: "does", LEMMA: "do"},
     {ORTH: "did", LEMMA: "do", TAG: "VBD"},
     {ORTH: "had", LEMMA: "have", TAG: "VBD"},
-    {ORTH: "may"},
-    {ORTH: "might"},
-    {ORTH: "must"},
+    {ORTH: "may", TAG: "MD"},
+    {ORTH: "might", TAG: "MD"},
+    {ORTH: "must", TAG: "MD"},
     {ORTH: "need"},
     {ORTH: "ought"},
-    {ORTH: "sha", LEMMA: "shall"},
-    {ORTH: "should"},
-    {ORTH: "wo", LEMMA: "will"},
-    {ORTH: "would"}
+    {ORTH: "sha", LEMMA: "shall", TAG: "MD"},
+    {ORTH: "should", TAG: "MD"},
+    {ORTH: "wo", LEMMA: "will", TAG: "MD"},
+    {ORTH: "would", TAG: "MD"}
 ]:
     verb_data_tc = dict(verb_data)
     verb_data_tc[ORTH] = verb_data_tc[ORTH].title()
diff --git a/spacy/language.py b/spacy/language.py
index a90e580ca..25bfb9e08 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -144,6 +144,7 @@ class BaseDefaults(object):
             pipeline.append(nlp.tagger)
         if nlp.parser:
             pipeline.append(nlp.parser)
+            pipeline.append(PseudoProjectivity.deprojectivize)
         if nlp.entity:
             pipeline.append(nlp.entity)
         return pipeline
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 1883ae89a..c5e520656 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -161,6 +161,13 @@ def _convert_strings(token_specs, string_store):
     return tokens
 
 
+def merge_phrase(matcher, doc, i, matches):
+    '''Callback to merge a phrase on match'''
+    ent_id, label, start, end = matches[i]
+    span = doc[start : end]
+    span.merge(ent_type=label, ent_id=ent_id)
+
+
 cdef class Matcher:
     '''Match sequences of tokens, based on pattern rules.'''
     cdef Pool mem
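The new merge_phrase callback is exercised by the regression test added later
in this patch; a minimal usage sketch, assuming an installed 'en' model
(spaCy 1.x API, mirroring that test):

    from spacy import load as load_spacy
    from spacy.attrs import LEMMA
    from spacy.matcher import merge_phrase

    nlp = load_spacy('en')
    # on_match fires merge_phrase, which collapses the matched tokens into
    # a single token carrying the pattern's entity label and ID
    nlp.matcher.add('splash', 'my_entity', {},
                    [[{LEMMA: 'splash'}, {LEMMA: 'on'}]],
                    on_match=merge_phrase)
    doc = nlp(u'splash On', parse=False)  # 'splash On' is merged to one token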
diff --git a/spacy/pt/language_data.py b/spacy/pt/language_data.py
index f9899d8d1..d96cdd38f 100644
--- a/spacy/pt/language_data.py
+++ b/spacy/pt/language_data.py
@@ -5,13 +5,15 @@ from .. import language_data as base
 from ..language_data import update_exc, strings_to_exc
 
 from .stop_words import STOP_WORDS
-
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, ORTH_ONLY
 
 STOP_WORDS = set(STOP_WORDS)
 
-TOKENIZER_EXCEPTIONS = strings_to_exc(base.EMOTICONS)
+TOKENIZER_EXCEPTIONS = dict(TOKENIZER_EXCEPTIONS)
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(ORTH_ONLY))
 update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.ABBREVIATIONS))
+update_exc(TOKENIZER_EXCEPTIONS, strings_to_exc(base.EMOTICONS))
 
 
 __all__ = ["TOKENIZER_EXCEPTIONS", "STOP_WORDS"]
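A rough sketch of the two helpers this module leans on, assuming
strings_to_exc builds a one-token exception per string and update_exc folds
the additions into the base dict (an illustration, not the actual
spacy.language_data implementation):

    # Hypothetical sketch of the helpers used above, for illustration only.
    def strings_to_exc(orths):
        # one single-token exception per string, keyed by the string itself
        return {orth: [{ORTH: orth}] for orth in orths}

    def update_exc(exc, additions):
        # fold the new exceptions into the existing dict
        exc.update(additions)

Read this way, the Portuguese contractions are loaded first, then the
ORTH_ONLY abbreviations, then the shared base abbreviations and emoticons.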
diff --git a/spacy/pt/stop_words.py b/spacy/pt/stop_words.py
index 311b82477..a24356881 100644
--- a/spacy/pt/stop_words.py
+++ b/spacy/pt/stop_words.py
@@ -3,18 +3,19 @@ from __future__ import unicode_literals
 
 
 STOP_WORDS = set("""
-à às acerca adeus agora ainda algmas algo algumas alguns ali além ambos ano
-anos antes ao aos apenas apoio apontar após aquela aquelas aquele aqueles aqui
-aquilo area área as assim através atrás até aí
+à às acerca adeus agora ainda algo algumas alguns ali além ambas ambos ano
+anos antes ao aos apenas apoio apoia apontar após aquela aquelas aquele aqueles
+aqui aquilo área as assim através atrás até aí
 
-baixo bastante bem bom breve
+baixo bastante bem boa bom breve
 
 cada caminho catorze cedo cento certamente certeza cima cinco coisa com como
-comprido conhecido conselho contra corrente custa cá
+comprido comprida conhecida conhecido conselho contra corrente custa cá
 
-da daquela daquele dar das de debaixo demais dentro depois desde desligado
-dessa desse desta deste deve devem deverá dez dezanove dezasseis dezassete
-dezoito dia diante direita diz dizem dizer do dois dos doze duas dá dão dúvida
+da daquela daquele dar das de debaixo demais dentro depois desde desligada
+desligado dessa desse desta deste deve devem deverá dez dezanove dezasseis
+dezassete dezoito dia diante direita diz dizem dizer do dois dos doze duas dá
+dão dúvida
 
 é ela elas ele eles em embora enquanto entre então era és essa essas esse
 esses esta estado estar estará estas estava este estes esteve estive estivemos
@@ -27,7 +28,7 @@ geral grande grandes grupo
 
 hoje horas há
 
-iniciar inicio ir irá isso ista iste isto já
+iniciar inicio ir irá isso isto já
 
 lado ligado local logo longe lugar lá
 
@@ -35,32 +36,53 @@ maior maioria maiorias mais mal mas me meio menor menos meses mesmo meu meus
 mil minha minhas momento muito muitos máximo mês
 
 na nada naquela naquele nas nem nenhuma nessa nesse nesta neste no noite nome
-nos nossa nossas nosso nossos nova nove novo novos num numa nunca não nível nós
-número
+nos nossa nossas nosso nossos nova novas nove novo novos num numa nunca nuns
+não nível nós número números
 
 obra obrigada obrigado oitava oitavo oito onde ontem onze os ou outra outras
 outro outros
 
 para parece parte partir pegar pela pelas pelo pelos perto pessoas pode podem
 poder poderá podia ponto pontos por porque porquê posição possivelmente posso
-possível pouca pouco povo primeira primeiro promeiro próprio próximo puderam
-pôde põe põem
+possível pouca pouco povo primeira primeiro próprio próxima próximo puderam pôde
+põe põem
 
-qual qualquer quando quanto quarta quarto quatro que quem quer quero questão
-quieto quinta quinto quinze quê relação
+qual qualquer quando quanto quarta quarto quatro que quem quer querem quero
+questão quieta quieto quinta quinto quinze quê
+
+relação
 
 sabe saber se segunda segundo sei seis sem sempre ser seria sete seu seus sexta
 sexto sim sistema sob sobre sois somente somos sou sua suas são sétima sétimo
 
-tal talvez também tanto tarde te tem temos tempo tendes tenho tens tentar
+tal talvez também tanta tanto tarde te tem temos tempo tendes tenho tens tentar
 tentaram tente tentei ter terceira terceiro teu teus teve tipo tive tivemos
 tiveram tiveste tivestes toda todas todo todos trabalhar trabalho treze três tu
 tua tuas tudo tão têm
 
 último um uma umas uns usa usar
 
-vai vais valor veja vem vens ver verdade verdadeiro vez vezes viagem vindo
-vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós
+vai vais valor veja vem vens ver verdade verdadeira verdadeiro vez vezes viagem
+vinda vindo vinte você vocês vos vossa vossas vosso vossos vários vão vêm vós
 
 zero
 """.split())
+
+
+# Number words
+
+NUM_WORDS = set("""
+zero um dois três quatro cinco seis sete oito nove dez onze doze treze catorze
+quinze dezasseis dezassete dezoito dezanove vinte trinta quarenta cinquenta
+sessenta setenta oitenta noventa cem mil milhão bilião trilião quadrilião
+""".split())
+
+# Ordinal words
+
+ORDINAL_WORDS = set("""
+primeiro segundo terceiro quarto quinto sexto sétimo oitavo nono décimo
+vigésimo trigésimo quadragésimo quinquagésimo sexagésimo septuagésimo
+octogésimo nonagésimo centésimo ducentésimo trecentésimo quadringentésimo
+quingentésimo sexcentésimo septingentésimo octingentésimo nongentésimo
+milésimo milionésimo bilionésimo
+""".split())
diff --git a/spacy/pt/tokenizer_exceptions.py b/spacy/pt/tokenizer_exceptions.py
new file mode 100644
index 000000000..1e02f6c6e
--- /dev/null
+++ b/spacy/pt/tokenizer_exceptions.py
@@ -0,0 +1,111 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..symbols import *
+from ..language_data import PRON_LEMMA
+
+TOKENIZER_EXCEPTIONS = {}
+
+# Contractions
+CONTRACTIONS = {}
+
+personal_pronoun = (
+    "ele", "ela", "eles", "elas"
+)
+demonstrative_pronouns = (
+    "este", "esta", "estes", "estas", "isto", "esse", "essa", "esses", "essas",
+    "isso", "aquele", "aquela", "aqueles", "aquelas", "aquilo"
+)
+undefined_pronouns = (
+    "outro", "outra", "outros", "outras"
+)
+adverbs = (
+    "aqui", "aí", "ali", "além"
+)
+
+for word in personal_pronoun + demonstrative_pronouns + \
+        undefined_pronouns + adverbs:
+    CONTRACTIONS["d" + word] = [
+        {ORTH: "d", NORM: "de"},
+        {ORTH: word}
+    ]
+
+for word in personal_pronoun + demonstrative_pronouns + \
+        undefined_pronouns:
+    CONTRACTIONS["n" + word] = [
+        {ORTH: "n", NORM: "em"},
+        {ORTH: word}
+    ]
+
+# Not so linear contractions: "a" + something
+
+CONTRACTIONS.update({
+    # This one cannot be split into 2
+    # "à": [
+    #     {ORTH: "à", NORM: "a"},
+    #     {ORTH: "", NORM: "a"}
+    # ],
+    "às": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "s", NORM: "as"}
+    ],
+    "ao": [
+        {ORTH: "a"},
+        {ORTH: "o"}
+    ],
+    "aos": [
+        {ORTH: "a"},
+        {ORTH: "os"}
+    ],
+    "àquele": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quele", NORM: "aquele"}
+    ],
+    "àquela": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quela", NORM: "aquela"}
+    ],
+    "àqueles": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "queles", NORM: "aqueles"}
+    ],
+    "àquelas": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quelas", NORM: "aquelas"}
+    ],
+    "àquilo": [
+        {ORTH: "à", NORM: "a"},
+        {ORTH: "quilo", NORM: "aquilo"}
+    ],
+    "aonde": [
+        {ORTH: "a"},
+        {ORTH: "onde"}
+    ],
+})
+
+TOKENIZER_EXCEPTIONS.update(CONTRACTIONS)
+
+# Abbreviations with only one ORTH token
+
+ORTH_ONLY = [
+    "Adm.",
+    "Dr.",
+    "e.g.",
+    "E.g.",
+    "E.G.",
+    "Gen.",
+    "Gov.",
+    "i.e.",
+    "I.e.",
+    "I.E.",
+    "Jr.",
+    "Ltd.",
+    "p.m.",
+    "Ph.D.",
+    "Rep.",
+    "Rev.",
+    "Sen.",
+    "Sr.",
+    "Sra.",
+    "vs.",
+]
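To make the generated contraction entries concrete, this is what the two
loops above produce for the personal pronoun "ele", spelling out one
iteration (NORM records the expanded form of each piece):

    # "dele" is the contraction of "de" + "ele"
    CONTRACTIONS["dele"] = [
        {ORTH: "d", NORM: "de"},
        {ORTH: "ele"}
    ]
    # "nele" is the contraction of "em" + "ele"
    CONTRACTIONS["nele"] = [
        {ORTH: "n", NORM: "em"},
        {ORTH: "ele"}
    ]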
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index bea0c9b45..93bc21e22 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -427,8 +427,6 @@ cdef class ArcEager(TransitionSystem):
 
     def finalize_doc(self, doc):
         doc.is_parsed = True
-        if doc.vocab.lang == 'de':
-            PseudoProjectivity.deprojectivize(doc)
 
     cdef int set_valid(self, int* output, const StateC* st) nogil:
         cdef bint[N_MOVES] is_valid
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 764efea8b..123ae03da 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -257,6 +257,7 @@ cdef class Parser:
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat) nogil:
         state = new StateC(tokens, length)
         # NB: This can change self.moves.n_moves!
+        # I think this causes memory errors if called by .pipe()
         self.moves.initialize_state(state)
         nr_class = self.moves.n_moves
diff --git a/spacy/tests/regression/test_issue758.py b/spacy/tests/regression/test_issue758.py
new file mode 100644
index 000000000..2ddba9975
--- /dev/null
+++ b/spacy/tests/regression/test_issue758.py
@@ -0,0 +1,17 @@
+from ... import load as load_spacy
+from ...attrs import LEMMA
+from ...matcher import merge_phrase
+
+import pytest
+
+
+
+
+@pytest.mark.models
+def test_issue758():
+    '''Test parser transition bug after label added.'''
+    nlp = load_spacy('en')
+    nlp.matcher.add('splash', 'my_entity', {},
+                    [[{LEMMA: 'splash'}, {LEMMA: 'on'}]],
+                    on_match=merge_phrase)
+    doc = nlp('splash On', parse=False)
diff --git a/spacy/tests/spans/test_merge.py b/spacy/tests/spans/test_merge.py
index 86712f771..29cc917fe 100644
--- a/spacy/tests/spans/test_merge.py
+++ b/spacy/tests/spans/test_merge.py
@@ -19,6 +19,15 @@ def test_spans_merge_tokens(en_tokenizer):
     assert doc[0].text == 'Los Angeles'
     assert doc[0].head.text == 'start'
 
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=heads)
+    assert len(doc) == 4
+    assert doc[0].head.text == 'Angeles'
+    assert doc[1].head.text == 'start'
+    doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE')
+    assert len(doc) == 3
+    assert doc[0].text == 'Los Angeles'
+    assert doc[0].head.text == 'start'
+    assert doc[0].ent_type_ == 'GPE'
 
 def test_spans_merge_heads(en_tokenizer):
     text = "I found a pilates class near work."
@@ -114,4 +123,4 @@ def test_spans_subtree_size_check(en_tokenizer):
     sent1 = list(doc.sents)[0]
     init_len = len(list(sent1.root.subtree))
     doc[0:2].merge('none', 'none', 'none')
-    assert len(list(sent1.root.subtree)) == init_len - 1
+    assert len(list(sent1.root.subtree)) == init_len - 1
\ No newline at end of file
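The assertions added above exercise Doc.merge's keyword-only path, which the
doc.pyx hunk below wires up: a string passed as label is interned and mapped
to ENT_TYPE, so the merged token comes out entity-typed. In isolation, a
sketch mirroring the test:

    # label='GPE' ends up as the merged token's entity type via the
    # ENT_TYPE mapping added in doc.pyx below
    doc.merge(0, len('Los Angeles'), tag='NNP', lemma='Los Angeles', label='GPE')
    assert doc[0].ent_type_ == 'GPE'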
" @@ -686,6 +696,9 @@ cdef class Doc: tag = self.vocab.strings[attributes.get(TAG, span.root.tag)] lemma = self.vocab.strings[attributes.get(LEMMA, span.root.lemma)] ent_type = self.vocab.strings[attributes.get(ENT_TYPE, span.root.ent_type)] + ent_id = attributes.get('ent_id', span.root.ent_id) + if isinstance(ent_id, basestring): + ent_id = self.vocab.strings[ent_id] # Get LexemeC for newly merged token new_orth = ''.join([t.text_with_ws for t in span]) @@ -706,6 +719,7 @@ cdef class Doc: else: token.ent_iob = 3 token.ent_type = self.vocab.strings[ent_type] + token.ent_id = ent_id # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets # Before thinking of something simpler, beware the case where a dependency diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index b8e470437..a89b35eee 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -506,21 +506,15 @@ cdef class Token: return self.c.ent_id def __set__(self, hash_t key): - # TODO - raise NotImplementedError( - "Can't yet set ent_id from Token. Vote for this feature on the issue " - "tracker: http://github.com/spacy-io/spaCy") + self.c.ent_id = key property ent_id_: '''A (string) entity ID. Usually assigned by patterns in the Matcher.''' def __get__(self): return self.vocab.strings[self.c.ent_id] - def __set__(self, hash_t key): - # TODO - raise NotImplementedError( - "Can't yet set ent_id_ from Token. Vote for this feature on the issue " - "tracker: http://github.com/spacy-io/spaCy") + def __set__(self, name): + self.c.ent_id = self.vocab.strings[name] property whitespace_: def __get__(self): diff --git a/spacy/util.py b/spacy/util.py index 6c25ce0e8..2d9812839 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -7,7 +7,6 @@ import re import os.path import pathlib import sys - import textwrap diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index c219585f9..c8c85af1d 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -151,6 +151,11 @@ "url": "https://github.com/golastmile/rasa_nlu", "author": "LASTMILE", "description": "High level APIs for building your own language parser using existing NLP and ML libraries." + }, + "spacyr": { + "url": "https://github.com/kbenoit/spacyr", + "author": "Kenneth Benoit", + "description": "An R wrapper for spaCy." } }, "visualizations": { diff --git a/website/docs/usage/troubleshooting.jade b/website/docs/usage/troubleshooting.jade index 06454b055..cb8271343 100644 --- a/website/docs/usage/troubleshooting.jade +++ b/website/docs/usage/troubleshooting.jade @@ -33,7 +33,6 @@ p | import the language's #[code Language] class instead, for example | #[code from spacy.fr import French]. - +h(3, "symlink-privilege") Symbolic link privilege not held +code(false, "text"). @@ -51,6 +50,20 @@ p | or use a #[code virtualenv] to install spaCy in a user directory, instead | of doing a system-wide installation. ++h(3, "no-cache-dir") No such option: --no-cache-dir + ++code(false, "text"). + no such option: --no-cache-dir + +p + | The #[code download] command uses pip to install the models and sets the + | #[code --no-cache-dir] flag to prevent it from requiring too much memory. + | #[+a("https://pip.pypa.io/en/stable/reference/pip_install/#caching") This setting] + | requires pip v6.0 or newer. + ++infobox("Solution") + | Run #[code pip install -U pip] to upgrade to the latest version of pip. + | To see which version you have installed, run #[code pip --version]. 
+h(3, "import-error") Import error