diff --git a/.github/contributors/sorenlind.md b/.github/contributors/sorenlind.md new file mode 100644 index 000000000..73e42636f --- /dev/null +++ b/.github/contributors/sorenlind.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5.
You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Søren Lind Kristiansen | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 24 November 2017 | +| GitHub username | sorenlind | +| Website (optional) | | diff --git a/spacy/lang/da/tokenizer_exceptions.py b/spacy/lang/da/tokenizer_exceptions.py index c67c038bf..584ccf6f9 100644 --- a/spacy/lang/da/tokenizer_exceptions.py +++ b/spacy/lang/da/tokenizer_exceptions.py @@ -1,32 +1,134 @@ # encoding: utf8 +""" +Tokenizer Exceptions. +Source: https://forkortelse.dk/ and various others.
+""" + from __future__ import unicode_literals -from ...symbols import ORTH, LEMMA, NORM +from ...symbols import ORTH, LEMMA, NORM, TAG, PUNCT _exc = {} +# Weekday abbreviations "søn."/"Søn." (for "søndag") and the capitalized +# "Tor."/"Tors." (for "torsdag") are left out because they are ambiguous; +# lowercase "tor."/"tors." are kept. "jul."/"Jul." ("juli") are left out too. for exc_data in [ - {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, - {ORTH: "Jan.", LEMMA: "januar", NORM: "januar"}, - {ORTH: "Feb.", LEMMA: "februar", NORM: "februar"}, - {ORTH: "Mar.", LEMMA: "marts", NORM: "marts"}, - {ORTH: "Apr.", LEMMA: "april", NORM: "april"}, - {ORTH: "Maj.", LEMMA: "maj", NORM: "maj"}, - {ORTH: "Jun.", LEMMA: "juni", NORM: "juni"}, - {ORTH: "Jul.", LEMMA: "juli", NORM: "juli"}, - {ORTH: "Aug.", LEMMA: "august", NORM: "august"}, - {ORTH: "Sep.", LEMMA: "september", NORM: "september"}, - {ORTH: "Okt.", LEMMA: "oktober", NORM: "oktober"}, - {ORTH: "Nov.", LEMMA: "november", NORM: "november"}, - {ORTH: "Dec.", LEMMA: "december", NORM: "december"}]: + {ORTH: "Kbh.", LEMMA: "København", NORM: "København"}, + {ORTH: "jan.", LEMMA: "januar"}, + {ORTH: "febr.", LEMMA: "februar"}, + {ORTH: "feb.", LEMMA: "februar"}, + {ORTH: "mar.", LEMMA: "marts"}, + {ORTH: "apr.", LEMMA: "april"}, + {ORTH: "jun.", LEMMA: "juni"}, + {ORTH: "aug.", LEMMA: "august"}, + {ORTH: "sept.", LEMMA: "september"}, + {ORTH: "sep.", LEMMA: "september"}, + {ORTH: "okt.", LEMMA: "oktober"}, + {ORTH: "nov.", LEMMA: "november"}, + {ORTH: "dec.", LEMMA: "december"}, + {ORTH: "man.", LEMMA: "mandag"}, + {ORTH: "tirs.", LEMMA: "tirsdag"}, + {ORTH: "ons.", LEMMA: "onsdag"}, + {ORTH: "tor.", LEMMA: "torsdag"}, + {ORTH: "tors.", LEMMA: "torsdag"}, + {ORTH: "fre.", LEMMA: "fredag"}, + {ORTH: "lør.", LEMMA: "lørdag"}, + {ORTH: "Jan.", LEMMA: "januar"}, + {ORTH: "Febr.", LEMMA: "februar"}, + {ORTH: "Feb.", LEMMA: "februar"}, + {ORTH: "Mar.", LEMMA: "marts"}, + {ORTH: "Apr.", LEMMA: "april"}, + {ORTH: "Jun.", LEMMA: "juni"},
+ {ORTH: "Aug.", LEMMA: "august"}, + {ORTH: "Sept.", LEMMA: "september"}, + {ORTH: "Sep.", LEMMA: "september"}, + {ORTH: "Okt.", LEMMA: "oktober"}, + {ORTH: "Nov.", LEMMA: "november"}, + {ORTH: "Dec.", LEMMA: "december"}, + {ORTH: "Man.", LEMMA: "mandag"}, + {ORTH: "Tirs.", LEMMA: "tirsdag"}, + {ORTH: "Ons.", LEMMA: "onsdag"}, + {ORTH: "Fre.", LEMMA: "fredag"}, + {ORTH: "Lør.", LEMMA: "lørdag"}]: _exc[exc_data[ORTH]] = [exc_data] for orth in [ - "A/S", "beg.", "bl.a.", "ca.", "d.s.s.", "dvs.", "f.eks.", "fr.", "hhv.", - "if.", "iflg.", "m.a.o.", "mht.", "min.", "osv.", "pga.", "resp.", "self.", - "t.o.m.", "vha.", ""]: + "A.D.", "A/S", "aarh.", "ac.", "adj.", "adr.", "adsk.", "adv.", "afb.", + "afd.", "afg.", "afk.", "afs.", "aht.", "alg.", "alk.", "alm.", "amer.", + "ang.", "ank.", "anl.", "anv.", "arb.", "arr.", "att.", "B.C.", "bd.", + "bdt.", "beg.", "begr.", "beh.", "bet.", "bev.", "bhk.", "bib.", + "bibl.", "bidr.", "bildl.", "bill.", "bio.", "biol.", "bk.", "BK.", + "bl.", "bl.a.", "borgm.", "bot.", "Boul.", "br.", "brolægn.", "bto.", + "bygn.", "ca.", "cand.", "Chr.", "d.", "d.d.", "d.m.", "d.s.", "d.s.s.", + "d.y.", "d.å.", "d.æ.", "da.", "dagl.", "dat.", "dav.", "def.", "dek.", + "dep.", "desl.", "diam.", "dir.", "disp.", "distr.", "div.", "dkr.", + "dl.", "do.", "dobb.", "Dr.", "dr.h.c", "Dronn.", "ds.", "dvs.", "e.b.", + "e.l.", "e.o.", "e.v.t.", "eftf.", "eftm.", "eg.", "egl.", "eks.", + "eksam.", "ekskl.", "eksp.", "ekspl.", "el.", "el.lign.", "emer.", + "endv.", "eng.", "enk.", "etc.", "etym.", "eur.", "evt.", "exam.", "f.", + "f.eks.", "f.m.", "f.n.", "f.o.", "f.o.m.", "f.s.v.", "f.t.", "f.v.t.", + "f.å.", "fa.", "fakt.", "fam.", "fem.", "ff.", "fg.", "fhv.", "fig.", + "filol.", "filos.", "fl.", "flg.", "fm.", "fmd.", "fol.", "forb.", + "foreg.", "foren.", "forf.", "fork.", "form.", "forr.", "fors.", + "forsk.", "forts.", "fr.", "fr.u.", "frk.", "fsva.", "fuldm.", "fung.", + "fx.", "fys.", "fær.", "g.d.", "g.m.", "gd.", "gdr.", "genuds.", "gl.",
+ "gn.", "gns.", "gr.", "grdl.", "gross.", "h.a.", "h.c.", "H.K.H.", + "H.M.", "hdl.", "henv.", "Hf.", "hhv.", "hj.hj.", "hj.spl.", "hort.", + "hosp.", "hpl.", "Hr.", "hr.", "hrs.", "hum.", "hvp.", "i/s", "I/S", + "i.e.", "ib.", "id.", "if.", "iflg.", "ifm.", "ift.", "iht.", "ill.", + "indb.", "indreg.", "inf.", "ing.", "inh.", "inj.", "inkl.", "insp.", + "instr.", "isl.", "istf.", "it.", "ital.", "iv.", "jap.", "jf.", "jfr.", + "jnr.", "j.nr.", "jr.", "jur.", "jvf.", "K.", "kap.", "kat.", "kbh.", + "kem.", "kgl.", "kl.", "kld.", "knsp.", "komm.", "kons.", "korr.", + "kp.", "Kprs.", "kr.", "kst.", "kt.", "ktr.", "kv.", "kvt.", "l.", + "L.A.", "l.c.", "lab.", "lat.", "lb.m.", "lb.nr.", "lejl.", "lgd.", + "lic.", "lign.", "lin.", "ling.merc.", "litt.", "Ll.", "loc.cit.", + "lok.", "lrs.", "ltr.", "m/s", "M/S", "m.a.o.", "m.fl.", "m.m.", "m.v.", + "m.v.h.", "Mag.", "maks.", "md.", "mdr.", "mdtl.", "mezz.", "mfl.", + "m.h.p.", "m.h.t", "mht.", "mik.", "min.", "mio.", "modt.", "Mr.", + "mrk.", "mul.", "mv.", "n.br.", "n.f.", "nat.", "nb.", "Ndr.", + "nedenst.", "nl.", "nr.", "Nr.", "nto.", "nuv.", "o/m", "o.a.", "o.fl.", + "o.h.", "o.l.", "o.lign.", "o.m.a.", "o.s.fr.", "obl.", "obs.", + "odont.", "oecon.", "off.", "ofl.", "omg.", "omkr.", "omr.", "omtr.", + "opg.", "opl.", "opr.", "org.", "orig.", "osv.", "ovenst.", "overs.", + "ovf.", "p.", "p.a.", "p.b.a", "p.b.v", "p.c.", "p.m.", "p.m.v.", + "p.n.", "p.p.", "p.p.s.", "p.s.", "p.t.", "p.v.a.", "p.v.c.", "pag.", + "par.", "Pas.", "pass.", "pcs.", "pct.", "pd.", "pens.", "pers.", + "pft.", "pg.", "pga.", "pgl.", "Ph.d.", "pinx.", "pk.", "pkt.", + "polit.", "polyt.", "pos.", "pp.", "ppm.", "pr.", "prc.", "priv.", + "prod.", "prof.", "pron.", "Prs.", "præd.", "præf.", "præt.", "psych.", + "pt.", "pæd.", "q.e.d.", "rad.", "Rcp.", "red.", "ref.", "reg.", + "regn.", "rel.", "rep.", "repr.", "resp.", "rest.", "rm.", "rtg.", + "russ.", "s.", "s.br.", "s.d.", "s.f.", "s.m.b.a.", "s.u.", "s.å.", + "sa.", "sb.", "sc.", "scient.",
"scil.", "Sdr.", "sek.", "sekr.", + "self.", "sem.", "sen.", "shj.", "sign.", "sing.", "sj.", "skr.", + "Skt.", "slutn.", "sml.", "smp.", "sms.", "snr.", "soc.", "soc.dem.", + "sort.", "sp.", "spec.", "Spl.", "spm.", "spr.", "spsk.", "statsaut.", + "st.", "stk.", "str.", "stud.", "subj.", "subst.", "suff.", "sup.", + "suppl.", "sv.", "såk.", "sædv.", "sø.", "t/r", "t.", "t.h.", "t.o.", + "t.o.m.", "t.v.", "tab.", "tbl.", "tcp/ip", "td.", "tdl.", "tdr.", + "techn.", "tekn.", "temp.", "th.", "theol.", "ti.", "tidl.", "tilf.", + "tilh.", "till.", "tilsv.", "tjg.", "tkr.", "tlf.", "tlgr.", "to.", + "tr.", "trp.", "tsk.", "tv.", "ty.", "u/b", "udb.", "udbet.", "ugtl.", + "undt.", "v.", "v.f.", "var.", "vb.", "vedk.", "vedl.", "vedr.", + "vejl.", "Vg.", "vh.", "vha.", "vs.", "vsa.", "vær.", "zool.", "ø.lgd.", + "øv.", "øvr.", "årg.", "årh.", ""]: _exc[orth] = [{ORTH: orth}] +# Dates: keep day-of-month ordinals "1." through "31." as single tokens. +for h in range(1, 31 + 1): + for period in ["."]: # single-element list: "." is the only suffix generated + _exc["%d%s" % (h, period)] = [ + {ORTH: "%d." % h}] + +_custom_base_exc = { # "i." is split into two tokens: "i" and "." + "i.": [ + {ORTH: "i", LEMMA: "i", NORM: "i"}, + {ORTH: ".", TAG: PUNCT}] +} +_exc.update(_custom_base_exc) + TOKENIZER_EXCEPTIONS = _exc diff --git a/spacy/tests/lang/da/test_exceptions.py b/spacy/tests/lang/da/test_exceptions.py index d89fafd2c..71e34fc5c 100644 --- a/spacy/tests/lang/da/test_exceptions.py +++ b/spacy/tests/lang/da/test_exceptions.py @@ -3,13 +3,31 @@ from __future__ import unicode_literals import pytest -@pytest.mark.parametrize('text', ["ca.", "m.a.o.", "Jan.", "Dec."]) +@pytest.mark.parametrize('text', + ["ca.", "m.a.o.", "Jan.", "Dec.", "kr.", "jf."]) def test_da_tokenizer_handles_abbr(da_tokenizer, text): tokens = da_tokenizer(text) assert len(tokens) == 1 +@pytest.mark.parametrize('text', ["Jul.", "jul.", "Tor.", "Tors."]) +def test_da_tokenizer_handles_ambiguous_abbr(da_tokenizer, text): + tokens = da_tokenizer(text) + assert len(tokens) == 2 + +@pytest.mark.parametrize('text', ["1.", "10.", "31."]) +def
test_da_tokenizer_handles_dates(da_tokenizer, text): + tokens = da_tokenizer(text) + assert len(tokens) == 1 + def test_da_tokenizer_handles_exc_in_text(da_tokenizer): text = "Det er bl.a. ikke meningen" tokens = da_tokenizer(text) assert len(tokens) == 5 assert tokens[2].text == "bl.a." + +def test_da_tokenizer_handles_custom_base_exc(da_tokenizer): + text = "Her er noget du kan kigge i." + tokens = da_tokenizer(text) + assert len(tokens) == 8 + assert tokens[6].text == "i" + assert tokens[7].text == "."