This commit is contained in:
Matthew Honnibal 2017-03-11 11:13:37 -06:00
commit ea2592879f
5 changed files with 30 additions and 16 deletions

View File

@ -41,7 +41,7 @@ To distinguish issues that are opened by us, the maintainers, we usually add a
| [`duplicate`](https://github.com/explosion/spaCy/labels/duplicate) | Duplicates, i.e. issues that have been reported before |
| [`meta`](https://github.com/explosion/spaCy/labels/meta) | Meta topics, e.g. repo organisation and issue management |
| [`help wanted`](https://github.com/explosion/spaCy/labels/help%20wanted) | Requests for contributions |
| [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions suitable for begginners |
| [`help wanted (easy)`](https://github.com/explosion/spaCy/labels/help%20wanted%20%28easy%29) | Requests for contributions suitable for beginners |
## Contributing to the code base

View File

@ -9,6 +9,7 @@ This is a list of everyone who has made significant contributions to spaCy, in a
* Chris DuBois, [@chrisdubois](https://github.com/chrisdubois)
* Christoph Schwienheer, [@chssch](https://github.com/chssch)
* Dafne van Kuppevelt, [@dafnevk](https://github.com/dafnevk)
* Daniel Rapp, [@rappdw](https://github.com/rappdw)
* Dmytro Sadovnychyi, [@sadovnychyi](https://github.com/sadovnychyi)
* György Orosz, [@oroszgy](https://github.com/oroszgy)
* Henning Peters, [@henningpeters](https://github.com/henningpeters)

View File

@ -27,10 +27,21 @@ ABBREVIATIONS = {
"সে.": [
{ORTH: "সে.", LEMMA: "সেলসিয়াস"},
],
"কি.মি": [
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
"কি.মি.": [
{ORTH: "কি.মি.", LEMMA: "কিলোমিটার"},
],
"কি.মি": [
{ORTH: "কি.মি", LEMMA: "কিলোমিটার"},
],
"সে.মি.": [
{ORTH: "সে.মি.", LEMMA: "সেন্টিমিটার"},
],
"সে.মি": [
{ORTH: "সে.মি", LEMMA: "সেন্টিমিটার"},
],
"মি.লি.": [
{ORTH: "মি.লি.", LEMMA: "মিলিলিটার"},
]
}
TOKENIZER_EXCEPTIONS.update(ABBREVIATIONS)

View File

@ -45,6 +45,6 @@ _URL_PATTERN = (
r"$"
).strip()
TOKEN_MATCH = re.compile(_URL_PATTERN).match
TOKEN_MATCH = re.compile(_URL_PATTERN, re.UNICODE).match
__all__ = ['TOKEN_MATCH']

View File

@ -21,11 +21,8 @@ URLS_FULL = URLS_BASIC + [
URLS_SHOULD_MATCH = [
"http://foo.com/blah_blah",
"http://foo.com/blah_blah/",
# "http://foo.com/blah_blah_(wikipedia)",
# "http://foo.com/blah_blah_(wikipedia)_(again)",
"http://www.example.com/wpstyle/?p=364",
"https://www.example.com/foo/?bar=baz&inga=42&quux",
"http://✪df.ws/123",
"http://userid:password@example.com:8080",
"http://userid:password@example.com:8080/",
"http://userid@example.com",
@ -36,7 +33,6 @@ URLS_SHOULD_MATCH = [
"http://userid:password@example.com/",
"http://142.42.1.1/",
"http://142.42.1.1:8080/",
"http://➡.ws/䨹",
"http://⌘.ws",
"http://⌘.ws/",
"http://foo.com/blah_(wikipedia)#cite-1",
@ -48,13 +44,19 @@ URLS_SHOULD_MATCH = [
"http://j.mp",
"ftp://foo.bar/baz",
"http://foo.bar/?q=Test%20URL-encoded%20stuff",
"http://مثال.إختبار",
"http://例子.测试",
# "http://उदाहरण.परीक्षा",
"http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
"http://1337.net",
"http://a.b-c.de",
"http://223.255.255.254",
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
"http://✪df.ws/123",
"http://➡.ws/䨹",
"http://مثال.إختبار",
"http://例子.测试",
pytest.mark.xfail("http://उदाहरण.परीक्षा"),
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)"),
pytest.mark.xfail("http://foo.com/blah_blah_(wikipedia)_(again)"),
]
URLS_SHOULD_NOT_MATCH = [
@ -74,7 +76,6 @@ URLS_SHOULD_NOT_MATCH = [
"///a",
"///",
"http:///a",
# "foo.com",
"rdar://1234",
"h://test",
"http:// shouldfail.com",
@ -82,21 +83,22 @@ URLS_SHOULD_NOT_MATCH = [
"http://foo.bar/foo(bar)baz quux",
"ftps://foo.bar/",
"http://-error-.invalid/",
# "http://a.b--c.de/", (this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
"http://-a.b.co",
"http://a.b-.co",
"http://0.0.0.0",
"http://10.1.1.0",
"http://10.1.1.255",
"http://224.1.1.1",
# "http://1.1.1.1.1",
"http://123.123.123",
"http://3628126748",
"http://.www.foo.bar/",
# "http://www.foo.bar./",
"http://.www.foo.bar./",
"http://10.1.1.1",
"NASDAQ:GOOG"
"NASDAQ:GOOG",
pytest.mark.xfail("foo.com"),
pytest.mark.xfail("http://1.1.1.1.1"),
pytest.mark.xfail("http://www.foo.bar./"),
]