From d153f18969f3182eba8f0094376f144a4c1a9af5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 29 Jul 2015 22:36:03 +0200 Subject: [PATCH 001/138] * Fix negative indices on spans --- spacy/tokens/spans.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 2c37e9b85..716d85a1a 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -40,6 +40,8 @@ cdef class Span: return self.end - self.start def __getitem__(self, int i): + if i < 0: + i = len(self) - i return self._seq[self.start + i] def __iter__(self): From 74d8cb39804ae7b74f09fc8e40316c3e5c806038 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jul 2015 02:29:49 +0200 Subject: [PATCH 002/138] * Add noun_chunks iterator, and fix left/right child setting in Doc.merge --- spacy/tokens/doc.pyx | 89 ++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index feb11bd87..ef901291d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES -from ..parts_of_speech cimport CONJ, PUNCT +from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..lexeme cimport check_flag from ..lexeme cimport get_attr as get_lex_attr -from .spans import Span +from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -154,6 +154,18 @@ cdef class Doc: if start != -1: yield Span(self, start, self.length, label=label) + @property + def noun_chunks(self): + """Yield spans for base noun phrases.""" + cdef const TokenC* word + labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj'] + np_deps = [self.vocab.strings[label] for label in labels] + np_label = self.vocab.strings['NP'] + for i in range(self.length): + word = &self.data[i] + if word.pos == NOUN and word.dep in np_deps: + yield Span(self, word.l_edge, i+1, label=np_label) + @property def sents(self): """ @@ -297,20 +309,7 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] - cdef TokenC* head - cdef TokenC* child - # Set left edges - for i in range(length): - child = &tokens[i] - head = &tokens[i + child.head] - if child < head and child.l_edge < head.l_edge: - head.l_edge = child.l_edge - # Set right edges --- same as above, but iterate in reverse - for i in range(length-1, -1, -1): - child = &tokens[i] - head = &tokens[i + child.head] - if child > head and child.r_edge > head.r_edge: - head.r_edge = child.r_edge + set_children_from_heads(self.data, self.length) return self def to_bytes(self): @@ -354,9 +353,12 @@ cdef class Doc: break else: return None - cdef unicode string = self.string + + cdef Span span = self[start:end] # Get LexemeC for newly merged token - new_orth = string[start_idx:end_idx] + new_orth = ''.join([t.string for t in span]) + if span[-1].whitespace_: + new_orth = new_orth[:-1] cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) # House the new merged token where it starts cdef TokenC* token = &self.data[start] @@ -372,30 +374,16 @@ cdef class Doc: else: token.ent_iob = 3 token.ent_type = self.vocab.strings[ent_type] - # Fix dependencies # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets + # Before 
thinking of something simpler, beware the case where a dependency + # bridges over the entity. Here the alignment of the tokens changes. + span_root = span.root.i for i in range(self.length): self.data[i].head += i - # Find the head of the merged token, and its dep relation - outer_heads = {} - for i in range(start, end): - head_idx = self.data[i].head - if head_idx == i or head_idx < start or head_idx >= end: - # Don't consider "heads" which are actually dominated by a word - # in the region we're merging - gp = head_idx - while self.data[gp].head != gp: - if start <= gp < end: - break - gp = self.data[gp].head - else: - # If we have multiple words attaching to the same head, - # but with different dep labels, we're preferring the last - # occurring dep label. Shrug. What else could we do, I guess? - outer_heads[head_idx] = self.data[i].dep - - token.head, token.dep = max(outer_heads.items()) + # Set the head of the merged token, and its dep relation, from the Span + token.head = self.data[span_root].head + token.dep = span.root.dep # Adjust deps before shrinking tokens # Tokens which point into the merged token should now point to it # Subtract the offset from all tokens which point to >= end @@ -406,7 +394,6 @@ cdef class Doc: self.data[i].head = start elif head_idx >= end: self.data[i].head -= offset - # TODO: Fix left and right deps # Now compress the token array for i in range(end, self.length): self.data[i - offset] = self.data[i] @@ -417,6 +404,28 @@ cdef class Doc: for i in range(self.length): # ...And, set heads back to a relative position self.data[i].head -= i - + # Set the left/right children, left/right edges + set_children_from_heads(self.data, self.length) + # Clear the cached Python objects + self._py_tokens = [None] * self.length # Return the merged Python object return self[start] + + +cdef int set_children_from_heads(TokenC* tokens, int length) except -1: + cdef TokenC* head + cdef TokenC* child + cdef int i + # Set left edges + for i in range(length): + child = &tokens[i] + head = &tokens[i + child.head] + if child < head and child.l_edge < head.l_edge: + head.l_edge = child.l_edge + # Set right edges --- same as above, but iterate in reverse + for i in range(length-1, -1, -1): + child = &tokens[i] + head = &tokens[i + child.head] + if child > head and child.r_edge > head.r_edge: + head.r_edge = child.r_edge + From 9590968fc170f205f8e5b9a141e4e99308511b78 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jul 2015 02:30:24 +0200 Subject: [PATCH 003/138] * Fix negative indices in Span --- spacy/tokens/spans.pyx | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 716d85a1a..f1c19f308 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -41,13 +41,17 @@ cdef class Span: def __getitem__(self, int i): if i < 0: - i = len(self) - i - return self._seq[self.start + i] + return self._seq[self.end + i] + else: + return self._seq[self.start + i] def __iter__(self): for i in range(self.start, self.end): yield self._seq[i] + def merge(self, unicode tag, unicode lemma, unicode ent_type): + self._seq.merge(self[0].idx, self[-1].idx + len(self[-1]), tag, lemma, ent_type) + property root: """The first ancestor of the first word of the span that has its head outside the span. 
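
The two __getitem__ changes above (PATCH 001 and PATCH 003) settle on offsetting negative indices from self.end: the earlier len(self) - i rewrite maps i = -1 to start + len(self) + 1, one position past the span, whereas self.end + i counts back from the span's last token as intended. Below is a minimal plain-Python sketch of the corrected behaviour, using a hypothetical SpanLike class in place of the real Cython Span, purely for illustration.

    # Hypothetical stand-in for spacy.tokens.spans.Span, for illustration only.
    class SpanLike(object):
        def __init__(self, seq, start, end):
            self._seq = seq      # parent token sequence (the Doc in spaCy)
            self.start = start   # index of the span's first token in _seq
            self.end = end       # index one past the span's last token in _seq

        def __len__(self):
            return self.end - self.start

        def __getitem__(self, i):
            # Negative indices count back from the end of the span, so they
            # are offset from end; non-negative indices are offset from start.
            # The earlier len(self) - i version indexed one past the span for
            # i == -1 instead of returning its last token.
            if i < 0:
                return self._seq[self.end + i]
            else:
                return self._seq[self.start + i]

    tokens = ['Give', 'it', 'back', '!']
    span = SpanLike(tokens, 1, 3)   # covers 'it back'
    assert len(span) == 2
    assert span[0] == 'it'          # start + 0
    assert span[-1] == 'back'       # end + (-1)
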
From 430e2edb9652626daa19081546308cb7394352e4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jul 2015 03:51:50 +0200 Subject: [PATCH 004/138] * Fix noun_chunks issue --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index ef901291d..f19df3f4e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -158,7 +158,7 @@ cdef class Doc: def noun_chunks(self): """Yield spans for base noun phrases.""" cdef const TokenC* word - labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj'] + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'conj'] np_deps = [self.vocab.strings[label] for label in labels] np_label = self.vocab.strings['NP'] for i in range(self.length): From 78a90683190416525ea8357f028867df6fcbe6bf Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jul 2015 04:25:58 +0200 Subject: [PATCH 005/138] * Fix spacy attr on merged tokens --- spacy/tokens/doc.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f19df3f4e..19a1c922e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -364,6 +364,7 @@ cdef class Doc: cdef TokenC* token = &self.data[start] # Update fields token.lex = lex + token.spacy = self.data[end].spacy # What to do about morphology?? # TODO: token.morph = ??? token.tag = self.vocab.strings[tag] From af84669306306365edbdba5d49ce947f57e2102a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jul 2015 05:12:48 +0200 Subject: [PATCH 006/138] * Add smart-quote possessive marker to tokenizer --- lang_data/en/suffix.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lang_data/en/suffix.txt b/lang_data/en/suffix.txt index 5ac21dbc9..d8c6bc2c2 100644 --- a/lang_data/en/suffix.txt +++ b/lang_data/en/suffix.txt @@ -16,6 +16,8 @@ '' 's 'S +’s +’S ’ \.\. \.\.\. From 4988356cf062fa61343ac678f70a36bdcb89c1dd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 1 Aug 2015 00:33:24 +0200 Subject: [PATCH 007/138] * Fix dependency type bug from merged tokens --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 19a1c922e..22096d4ed 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -380,11 +380,11 @@ cdef class Doc: # Before thinking of something simpler, beware the case where a dependency # bridges over the entity. Here the alignment of the tokens changes. 
span_root = span.root.i + token.dep = span.root.dep for i in range(self.length): self.data[i].head += i # Set the head of the merged token, and its dep relation, from the Span token.head = self.data[span_root].head - token.dep = span.root.dep # Adjust deps before shrinking tokens # Tokens which point into the merged token should now point to it # Subtract the offset from all tokens which point to >= end From eb7138c761392787d03cc10076d64026dcae50e3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 1 Aug 2015 00:34:40 +0200 Subject: [PATCH 008/138] * Add attr relation in base NP detection --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 22096d4ed..a3ae45733 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -158,7 +158,7 @@ cdef class Doc: def noun_chunks(self): """Yield spans for base noun phrases.""" cdef const TokenC* word - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'conj'] + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'conj', 'attr'] np_deps = [self.vocab.strings[label] for label in labels] np_label = self.vocab.strings['NP'] for i in range(self.length): From 4c87a696b3229bd41a111e607ced9495b5eef604 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 4 Aug 2015 15:55:28 +0200 Subject: [PATCH 009/138] * Add draft dfa matcher, in Python. Passing tests. --- spacy/matcher.pyx | 52 +++++++++++++++++++++++++++++++++++++++++++ tests/test_matcher.py | 52 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 spacy/matcher.pyx create mode 100644 tests/test_matcher.py diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx new file mode 100644 index 000000000..c19fd8919 --- /dev/null +++ b/spacy/matcher.pyx @@ -0,0 +1,52 @@ +class MatchState(object): + def __init__(self, token_spec, ext): + self.token_spec = token_spec + self.ext = ext + self.is_final = False + + def match(self, token): + for attr, value in self.token_spec: + if getattr(token, attr) != value: + return False + else: + return True + + def __repr__(self): + return '' % (self.token_spec) + + +class EndState(object): + def __init__(self, entity_type, length): + self.entity_type = entity_type + self.length = length + self.is_final = True + + def __call__(self, token): + return (self.entity_type, ((token.i+1) - self.length), token.i+1) + + def __repr__(self): + return '' % (self.entity_type) + + +class Matcher(object): + def __init__(self, patterns): + self.start_states = [] + for token_specs, entity_type in patterns: + state = EndState(entity_type, len(token_specs)) + for spec in reversed(token_specs): + state = MatchState(spec, state) + self.start_states.append(state) + + def __call__(self, tokens): + queue = list(self.start_states) + matches = [] + for token in tokens: + next_queue = list(self.start_states) + for pattern in queue: + if pattern.match(token): + if pattern.ext.is_final: + matches.append(pattern.ext(token)) + else: + next_queue.append(pattern.ext) + queue = next_queue + return matches diff --git a/tests/test_matcher.py b/tests/test_matcher.py new file mode 100644 index 000000000..391d9526c --- /dev/null +++ b/tests/test_matcher.py @@ -0,0 +1,52 @@ +from __future__ import unicode_literals +import pytest + +from spacy.matcher import * + + +class MockToken(object): + def __init__(self, i, string): + self.i = i + self.orth_ = string + + +def make_tokens(string): + return [MockToken(i, s) for i, s in enumerate(string.split())] + + 
+@pytest.fixture +def matcher(): + specs = [] + for string in ['JavaScript', 'Google Now', 'Java']: + spec = tuple([[('orth_', orth)] for orth in string.split()]) + specs.append((spec, 'product')) + return Matcher(specs) + + +def test_compile(matcher): + assert len(matcher.start_states) == 3 + + +def test_no_match(matcher): + tokens = make_tokens('I like cheese') + assert matcher(tokens) == [] + + +def test_match_start(matcher): + tokens = make_tokens('JavaScript is good') + assert matcher(tokens) == [('product', 0, 1)] + + +def test_match_end(matcher): + tokens = make_tokens('I like Java') + assert matcher(tokens) == [('product', 2, 3)] + + +def test_match_middle(matcher): + tokens = make_tokens('I like Google Now best') + assert matcher(tokens) == [('product', 2, 4)] + + +def test_match_multi(matcher): + tokens = make_tokens('I like Google Now and Java best') + assert matcher(tokens) == [('product', 2, 4), ('product', 5, 6)] From 5bc0e83f9a2c1ec23231732ccf74444411ab2717 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 5 Aug 2015 01:05:54 +0200 Subject: [PATCH 010/138] * Reimplement matching in Cython, instead of Python. --- spacy/matcher.pyx | 130 +++++++++++++++++++++++++++++------------- tests/test_matcher.py | 57 +++++++++--------- 2 files changed, 117 insertions(+), 70 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index c19fd8919..f6f3f95d6 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -1,52 +1,100 @@ -class MatchState(object): - def __init__(self, token_spec, ext): - self.token_spec = token_spec - self.ext = ext - self.is_final = False +from .typedefs cimport attr_t +from .attrs cimport attr_id_t +from .structs cimport TokenC - def match(self, token): - for attr, value in self.token_spec: - if getattr(token, attr) != value: - return False - else: - return True +from cymem.cymem cimport Pool +from libcpp.vector cimport vector - def __repr__(self): - return '' % (self.token_spec) +from .attrs cimport LENGTH, ENT_TYPE +from .tokens.doc cimport get_token_attr +from .tokens.doc cimport Doc +from .vocab cimport Vocab -class EndState(object): - def __init__(self, entity_type, length): - self.entity_type = entity_type - self.length = length - self.is_final = True - - def __call__(self, token): - return (self.entity_type, ((token.i+1) - self.length), token.i+1) - - def __repr__(self): - return '' % (self.entity_type) +cdef struct AttrValue: + attr_id_t attr + attr_t value -class Matcher(object): +cdef struct Pattern: + AttrValue* spec + int length + + +cdef Pattern* init_pattern(Pool mem, object token_specs, attr_t entity_type) except NULL: + pattern = mem.alloc(len(token_specs) + 1, sizeof(Pattern)) + cdef int i + for i, spec in enumerate(token_specs): + pattern[i].spec = mem.alloc(len(spec), sizeof(AttrValue)) + pattern[i].length = len(spec) + for j, (attr, value) in enumerate(spec): + pattern[i].spec[j].attr = attr + pattern[i].spec[j].value = value + i = len(token_specs) + pattern[i].spec = mem.alloc(1, sizeof(AttrValue)) + pattern[i].spec[0].attr = ENT_TYPE + pattern[i].spec[0].value = entity_type + pattern[i].spec[1].attr = LENGTH + pattern[i].spec[1].value = len(token_specs) + pattern[i].length = 0 + return pattern + + +cdef int match(const Pattern* pattern, const TokenC* token) except -1: + cdef int i + for i in range(pattern.length): + if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value: + return False + return True + + +cdef int is_final(const Pattern* pattern) except -1: + return (pattern + 1).length == 0 + + +cdef object 
get_entity(const Pattern* pattern, const TokenC* tokens, int i): + pattern += 1 + i += 1 + return (pattern.spec[0].value, i - pattern.spec[1].value, i) + + +cdef class Matcher: + cdef Pool mem + cdef Pattern** patterns + cdef readonly int n_patterns + def __init__(self, patterns): - self.start_states = [] - for token_specs, entity_type in patterns: - state = EndState(entity_type, len(token_specs)) - for spec in reversed(token_specs): - state = MatchState(spec, state) - self.start_states.append(state) + self.mem = Pool() + self.patterns = self.mem.alloc(len(patterns), sizeof(Pattern*)) + for i, (token_specs, entity_type) in enumerate(patterns): + self.patterns[i] = init_pattern(self.mem, token_specs, entity_type) + self.n_patterns = len(patterns) - def __call__(self, tokens): - queue = list(self.start_states) + def __call__(self, Doc doc): + cdef vector[Pattern*] partials + cdef int n_partials = 0 + cdef int q = 0 + cdef int i, token_i + cdef const TokenC* token + cdef Pattern* state matches = [] - for token in tokens: - next_queue = list(self.start_states) - for pattern in queue: - if pattern.match(token): - if pattern.ext.is_final: - matches.append(pattern.ext(token)) + for token_i in range(doc.length): + token = &doc.data[token_i] + q = 0 + for i in range(partials.size()): + state = partials.at(i) + if match(state, token): + if is_final(state): + matches.append(get_entity(state, token, token_i)) else: - next_queue.append(pattern.ext) - queue = next_queue + partials[q] = state + 1 + q += 1 + partials.resize(q) + for i in range(self.n_patterns): + state = self.patterns[i] + if match(state, token): + if is_final(state): + matches.append(get_entity(state, token, token_i)) + else: + partials.push_back(state + 1) return matches diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 391d9526c..fb3665623 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -1,52 +1,51 @@ from __future__ import unicode_literals import pytest +from spacy.strings import StringStore from spacy.matcher import * - - -class MockToken(object): - def __init__(self, i, string): - self.i = i - self.orth_ = string - - -def make_tokens(string): - return [MockToken(i, s) for i, s in enumerate(string.split())] +from spacy.attrs import ORTH +from spacy.tokens.doc import Doc +from spacy.vocab import Vocab @pytest.fixture -def matcher(): +def matcher(EN): specs = [] for string in ['JavaScript', 'Google Now', 'Java']: - spec = tuple([[('orth_', orth)] for orth in string.split()]) - specs.append((spec, 'product')) + spec = [] + for orth_ in string.split(): + spec.append([(ORTH, EN.vocab.strings[orth_])]) + specs.append((spec, EN.vocab.strings['product'])) return Matcher(specs) def test_compile(matcher): - assert len(matcher.start_states) == 3 + assert matcher.n_patterns == 3 - -def test_no_match(matcher): - tokens = make_tokens('I like cheese') +def test_no_match(matcher, EN): + tokens = EN('I like cheese') assert matcher(tokens) == [] -def test_match_start(matcher): - tokens = make_tokens('JavaScript is good') - assert matcher(tokens) == [('product', 0, 1)] +def test_match_start(matcher, EN): + tokens = EN('JavaScript is good') + assert matcher(tokens) == [(EN.vocab.strings['product'], 0, 1)] -def test_match_end(matcher): - tokens = make_tokens('I like Java') - assert matcher(tokens) == [('product', 2, 3)] +def test_match_end(matcher, EN): + tokens = EN('I like Java') + assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 3)] -def test_match_middle(matcher): - tokens = make_tokens('I like Google Now 
best') - assert matcher(tokens) == [('product', 2, 4)] +def test_match_middle(matcher, EN): + tokens = EN('I like Google Now best') + assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 4)] -def test_match_multi(matcher): - tokens = make_tokens('I like Google Now and Java best') - assert matcher(tokens) == [('product', 2, 4), ('product', 5, 6)] +def test_match_multi(matcher, EN): + tokens = EN('I like Google Now and Java best') + assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 4), + (EN.vocab.strings['product'], 5, 6)] + +def test_dummy(): + pass From 47db3067a0bda8ab6cb8adada5affd0cb2a22a72 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 5 Aug 2015 23:48:11 +0200 Subject: [PATCH 011/138] * Compile spacy.matcher --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3617b66dd..218272504 100644 --- a/setup.py +++ b/setup.py @@ -164,7 +164,7 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.gold', 'spacy.orth', 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', - 'spacy.cfile', + 'spacy.cfile', 'spacy.matcher', 'spacy.syntax.ner'] From 9c1724ecaef838a9c510b7b2b42d48e4ec5ccad6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 00:35:40 +0200 Subject: [PATCH 012/138] * Gazetteer stuff working, now need to wire up to API --- spacy/matcher.pyx | 1 + spacy/syntax/ner.pyx | 40 ++++++++++++++- spacy/syntax/stateclass.pxd | 2 +- spacy/tokens/doc.pxd | 5 ++ spacy/tokens/doc.pyx | 91 +++++++++++++++++++++------------ tests/spans/test_merge.py | 19 ++++++- tests/test_matcher.py | 13 ++++- tests/tokens/test_tokens_api.py | 13 ++++- 8 files changed, 146 insertions(+), 38 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index f6f3f95d6..b1b77e162 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -97,4 +97,5 @@ cdef class Matcher: matches.append(get_entity(state, token, token_i)) else: partials.push_back(state + 1) + doc.ents = list(sorted(list(doc.ents) + matches)) return matches diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index fbd580b29..8fa4a03d5 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -160,7 +160,17 @@ cdef class Missing: cdef class Begin: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: - return label != 0 and not st.entity_is_open() + # Ensure we don't clobber preset entities. If no entity preset, + # ent_iob is 0 + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 1: + return False + elif preset_ent_iob == 2: + return False + elif preset_ent_iob == 3 and st.B_(0).ent_type != label: + return False + else: + return label != 0 and not st.entity_is_open() @staticmethod cdef int transition(StateClass st, int label) nogil: @@ -190,6 +200,14 @@ cdef class Begin: cdef class In: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 3: + return False + # TODO: Is this quite right? 
+ elif st.B_(1).ent_iob != preset_ent_iob: + return False return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod @@ -230,6 +248,14 @@ cdef class In: cdef class Last: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 3: + return False + elif st.B_(1).ent_iob == 1: + return False + return st.entity_is_open() and label != 0 and st.E_(0).ent_type == label @staticmethod @@ -269,6 +295,13 @@ cdef class Last: cdef class Unit: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 2: + return False + elif preset_ent_iob == 1: + return False + elif st.B_(1).ent_iob == 1: + return False return label != 0 and not st.entity_is_open() @staticmethod @@ -300,6 +333,11 @@ cdef class Unit: cdef class Out: @staticmethod cdef bint is_valid(StateClass st, int label) nogil: + cdef int preset_ent_iob = st.B_(0).ent_iob + if preset_ent_iob == 3: + return False + elif preset_ent_iob == 1: + return False return not st.entity_is_open() @staticmethod diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 905d8cdde..888b01c32 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -125,7 +125,7 @@ cdef class StateClass: cdef void add_arc(self, int head, int child, int label) nogil cdef void del_arc(self, int head, int child) nogil - + cdef void open_ent(self, int label) nogil cdef void close_ent(self) nogil diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7de5e0bea..121018770 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -4,6 +4,11 @@ from preshed.counter cimport PreshCounter from ..vocab cimport Vocab from ..structs cimport TokenC, LexemeC +from ..typedefs cimport attr_t +from ..attrs cimport attr_id_t + + +cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a3ae45733..6d0cd9a8b 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -119,40 +119,67 @@ cdef class Doc: def string(self): return u''.join([t.string for t in self]) - @property - def ents(self): - """Yields named-entity Span objects. + property ents: + def __get__(self): + """Yields named-entity Span objects. - Iterate over the span to get individual Token objects, or access the label: + Iterate over the span to get individual Token objects, or access the label: - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp(u'Mr. Best flew to New York on Saturday morning.') - >>> ents = list(tokens.ents) - >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0]) - (112504, u'PERSON', u'Best ') - """ - cdef int i - cdef const TokenC* token - cdef int start = -1 - cdef int label = 0 - for i in range(self.length): - token = &self.data[i] - if token.ent_iob == 1: - assert start != -1 - pass - elif token.ent_iob == 2: - if start != -1: - yield Span(self, start, i, label=label) - start = -1 - label = 0 - elif token.ent_iob == 3: - if start != -1: - yield Span(self, start, i, label=label) - start = i - label = token.ent_type - if start != -1: - yield Span(self, start, self.length, label=label) + >>> from spacy.en import English + >>> nlp = English() + >>> tokens = nlp(u'Mr. 
Best flew to New York on Saturday morning.') + >>> ents = list(tokens.ents) + >>> ents[0].label, ents[0].label_, ''.join(t.orth_ for t in ents[0]) + (112504, u'PERSON', u'Best ') + """ + cdef int i + cdef const TokenC* token + cdef int start = -1 + cdef int label = 0 + output = [] + for i in range(self.length): + token = &self.data[i] + if token.ent_iob == 1: + assert start != -1 + elif token.ent_iob == 2 or token.ent_iob == 0: + if start != -1: + output.append(Span(self, start, i, label=label)) + start = -1 + label = 0 + elif token.ent_iob == 3: + if start != -1: + output.append(Span(self, start, i, label=label)) + start = i + label = token.ent_type + if start != -1: + output.append(Span(self, start, self.length, label=label)) + return tuple(output) + + def __set__(self, ents): + # TODO: + # 1. Allow negative matches + # 2. Ensure pre-set NERs are not over-written during statistical prediction + # 3. Test basic data-driven ORTH gazetteer + # 4. Test more nuanced date and currency regex + cdef int i + for i in range(self.length): + self.data[i].ent_type = 0 + self.data[i].ent_iob = 0 + cdef attr_t ent_type + cdef int start, end + for ent_type, start, end in ents: + if ent_type is None: + # Mark as O + for i in range(start, end): + self.data[i].ent_type = 0 + self.data[i].ent_iob = 2 + else: + # Mark (inside) as I + for i in range(start, end): + self.data[i].ent_type = ent_type + self.data[i].ent_iob = 1 + # Set start as B + self.data[start].ent_iob = 3 @property def noun_chunks(self): diff --git a/tests/spans/test_merge.py b/tests/spans/test_merge.py index 3bba13064..e225db043 100644 --- a/tests/spans/test_merge.py +++ b/tests/spans/test_merge.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import pytest - @pytest.mark.models def test_merge_tokens(EN): tokens = EN(u'Los Angeles start.') @@ -32,3 +31,21 @@ def test_merge_heads(EN): def test_issue_54(EN): text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' 
tokens = EN(text, merge_mwes=True) + +@pytest.mark.models +def test_np_merges(EN): + text = u'displaCy is a parse tool built with Javascript' + tokens = EN(text) + assert tokens[4].head.i == 1 + tokens.merge(tokens[2].idx, tokens[4].idx + len(tokens[4]), u'NP', u'tool', u'O') + assert tokens[2].head.i == 1 + tokens = EN('displaCy is a lightweight and modern dependency parse tree visualization tool built with CSS3 and JavaScript.') + + ents = [(e[0].idx, e[-1].idx + len(e[-1]), e.label_, e.lemma_) + for e in tokens.ents] + for start, end, label, lemma in ents: + merged = tokens.merge(start, end, label, lemma, label) + assert merged != None, (start, end, label, lemma) + for tok in tokens: + print tok.orth_, tok.dep_, tok.head.orth_ + diff --git a/tests/test_matcher.py b/tests/test_matcher.py index fb3665623..06950253c 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -47,5 +47,14 @@ def test_match_multi(matcher, EN): assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 4), (EN.vocab.strings['product'], 5, 6)] -def test_dummy(): - pass +def test_match_preserved(matcher, EN): + doc = EN.tokenizer('I like Java') + EN.tagger(doc) + EN.entity(doc) + assert len(doc.ents) == 0 + doc = EN.tokenizer('I like Java') + matcher(doc) + assert len(doc.ents) == 1 + EN.tagger(doc) + EN.entity(doc) + assert len(doc.ents) == 1 diff --git a/tests/tokens/test_tokens_api.py b/tests/tokens/test_tokens_api.py index b935bbce7..e1238373f 100644 --- a/tests/tokens/test_tokens_api.py +++ b/tests/tokens/test_tokens_api.py @@ -4,7 +4,6 @@ from spacy.tokens import Doc import pytest - @pytest.mark.models def test_getitem(EN): tokens = EN(u'Give it back! He pleaded.') @@ -32,3 +31,15 @@ def test_serialize_whitespace(EN): assert tokens.string == new_tokens.string assert [t.orth_ for t in tokens] == [t.orth_ for t in new_tokens] assert [t.orth for t in tokens] == [t.orth for t in new_tokens] + + +def test_set_ents(EN): + tokens = EN.tokenizer(u'I use goggle chrone to surf the web') + assert len(tokens.ents) == 0 + tokens.ents = [(EN.vocab.strings['PRODUCT'], 2, 4)] + assert len(list(tokens.ents)) == 1 + assert [t.ent_iob for t in tokens] == [0, 0, 3, 1, 0, 0, 0, 0] + ent = tokens.ents[0] + assert ent.label_ == 'PRODUCT' + assert ent.start == 2 + assert ent.end == 4 From 5737115e1e58e163c9f3e892eece78155c0727e5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 14:33:21 +0200 Subject: [PATCH 013/138] * Work on gazetteer matching --- spacy/en/__init__.py | 8 ++++++ spacy/matcher.pyx | 58 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 62 insertions(+), 4 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 5bf83a253..c81630a72 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -11,6 +11,7 @@ from ..syntax.arc_eager import ArcEager from ..syntax.ner import BiluoPushDown from ..syntax.parser import ParserFactory from ..serialize.bits import BitArray +from ..matcher import Matcher from ..tokens import Doc from ..multi_words import RegexMerger @@ -75,6 +76,7 @@ class English(object): Tagger=EnPosTagger, Parser=ParserFactory(ParserTransitionSystem), Entity=ParserFactory(EntityTransitionSystem), + Matcher=Matcher.from_dir, Packer=None, load_vectors=True ): @@ -113,6 +115,10 @@ class English(object): self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner')) else: self.entity = None + if Matcher: + self.matcher = Matcher(self.vocab, data_dir) + else: + self.matcher = None if Packer: self.packer = Packer(self.vocab, data_dir) else: @@ 
-143,6 +149,8 @@ class English(object): tokens = self.tokenizer(text) if self.tagger and tag: self.tagger(tokens) + if self.matcher and entity: + self.matcher(tokens) if self.parser and parse: self.parser(tokens) if self.entity and entity: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index b1b77e162..ab3ef354b 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -1,3 +1,5 @@ +from os import path + from .typedefs cimport attr_t from .attrs cimport attr_id_t from .structs cimport TokenC @@ -5,11 +7,16 @@ from .structs cimport TokenC from cymem.cymem cimport Pool from libcpp.vector cimport vector -from .attrs cimport LENGTH, ENT_TYPE +from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab +try: + import ujson as json +except ImportError: + import json + cdef struct AttrValue: attr_id_t attr @@ -58,18 +65,61 @@ cdef object get_entity(const Pattern* pattern, const TokenC* tokens, int i): return (pattern.spec[0].value, i - pattern.spec[1].value, i) +def _convert_strings(token_specs, string_store): + converted = [] + for spec in token_specs: + converted.append([]) + for attr, value in spec.items(): + if isinstance(attr, basestring): + attr = map_attr_name(attr) + if isinstance(value, basestring): + value = string_store[value] + converted[-1].append((attr, value)) + return converted + + +def map_attr_name(attr): + attr = attr.upper() + if attr == 'ORTH': + return ORTH + elif attr == 'LEMMA': + return LEMMA + elif attr == 'LOWER': + return LOWER + elif attr == 'SHAOE': + return SHAPE + elif attr == 'NORM': + return NORM + else: + raise Exception("TODO: Finish supporting attr mapping %s" % attr) + + cdef class Matcher: cdef Pool mem cdef Pattern** patterns cdef readonly int n_patterns - def __init__(self, patterns): + def __init__(self, vocab, patterns): self.mem = Pool() self.patterns = self.mem.alloc(len(patterns), sizeof(Pattern*)) - for i, (token_specs, entity_type) in enumerate(patterns): - self.patterns[i] = init_pattern(self.mem, token_specs, entity_type) + for i, (entity_key, (etype, attrs, specs)) in enumerate(sorted(patterns.items())): + if isinstance(entity_key, basestring): + entity_key = vocab.strings[entity_key] + if isinstance(etype, basestring): + etype = vocab.strings[etype] + specs = _convert_strings(specs, vocab.strings) + self.patterns[i] = init_pattern(self.mem, specs, etype) self.n_patterns = len(patterns) + @classmethod + def from_dir(cls, vocab, data_dir): + patterns_loc = path.join(data_dir, 'ner', 'patterns.json') + if path.exists(patterns_loc): + patterns = json.loads(open(patterns_loc)) + return cls(vocab, patterns) + else: + return cls(vocab, {}) + def __call__(self, Doc doc): cdef vector[Pattern*] partials cdef int n_partials = 0 From faf75dfcb97a787d16a076754c384a801f770f74 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 14:33:35 +0200 Subject: [PATCH 014/138] * Update matcher tests --- tests/test_matcher.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 06950253c..62b70b520 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -10,18 +10,18 @@ from spacy.vocab import Vocab @pytest.fixture def matcher(EN): - specs = [] - for string in ['JavaScript', 'Google Now', 'Java']: - spec = [] - for orth_ in string.split(): - spec.append([(ORTH, EN.vocab.strings[orth_])]) - specs.append((spec, EN.vocab.strings['product'])) - 
return Matcher(specs) + patterns = { + 'Javascript': ['PRODUCT', {}, [{'ORTH': 'JavaScript'}]], + 'GoogleNow': ['PRODUCT', {}, [{'ORTH': 'Google'}, {'ORTH': 'Now'}]], + 'Java': ['PRODUCT', {}, [{'ORTH': 'Java'}]], + } + return Matcher(EN.vocab, patterns) def test_compile(matcher): assert matcher.n_patterns == 3 + def test_no_match(matcher, EN): tokens = EN('I like cheese') assert matcher(tokens) == [] @@ -29,23 +29,24 @@ def test_no_match(matcher, EN): def test_match_start(matcher, EN): tokens = EN('JavaScript is good') - assert matcher(tokens) == [(EN.vocab.strings['product'], 0, 1)] + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 0, 1)] def test_match_end(matcher, EN): tokens = EN('I like Java') - assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 3)] + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)] def test_match_middle(matcher, EN): tokens = EN('I like Google Now best') - assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 4)] + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4)] def test_match_multi(matcher, EN): tokens = EN('I like Google Now and Java best') - assert matcher(tokens) == [(EN.vocab.strings['product'], 2, 4), - (EN.vocab.strings['product'], 5, 6)] + assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4), + (EN.vocab.strings['PRODUCT'], 5, 6)] + def test_match_preserved(matcher, EN): doc = EN.tokenizer('I like Java') From 3ecacb96358154c4ccb744455f793f747b0c1be4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:07:23 +0200 Subject: [PATCH 015/138] * Copy gazetteer file in init_model --- bin/init_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/init_model.py b/bin/init_model.py index db01885b3..3307bffa8 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -189,6 +189,10 @@ def main(lang_data_dir, corpora_dir, model_dir): setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') setup_vocab(corpora_dir, model_dir / 'vocab') + + if (lang_data_dir / 'gazetteer.json').exists(): + copyfile(str(lang_data_dir / 'gazetteer.json'), + str(model_dir / 'vocab' / 'gazetteer.json')) if not (model_dir / 'wordnet').exists(): copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet')) From c2635774247706c1ce503cf6af8be15d520885ac Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:07:41 +0200 Subject: [PATCH 016/138] * Fix lower attribute in lexeme.pxd --- spacy/lexeme.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index e0c99b3e6..f7b210281 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -87,7 +87,7 @@ cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: elif feat_name == ORTH: return lex.orth elif feat_name == LOWER: - return lex.norm + return lex.lower elif feat_name == NORM: return lex.norm elif feat_name == SHAPE: From 9c667b7f15c59dfa64793525b42ca59828ffeb4b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:08:04 +0200 Subject: [PATCH 017/138] * Set a value in attrs.pxd on the first flag, to reduce bugs --- spacy/attrs.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 2c3e2849d..d2ace1cff 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -14,7 +14,7 @@ cpdef enum attr_id_t: IS_STOP IS_OOV - FLAG13 + FLAG13 = 13 FLAG14 FLAG15 FLAG16 From cd7d1682cd074aad3dc0a0c6b354548fc863c455 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:08:25 +0200 
Subject: [PATCH 018/138] * Fix loading of gazetteer.json file --- spacy/matcher.pyx | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index ab3ef354b..5aee7ee8f 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -101,21 +101,28 @@ cdef class Matcher: def __init__(self, vocab, patterns): self.mem = Pool() - self.patterns = self.mem.alloc(len(patterns), sizeof(Pattern*)) - for i, (entity_key, (etype, attrs, specs)) in enumerate(sorted(patterns.items())): + n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()]) + self.patterns = self.mem.alloc(n_patterns, sizeof(Pattern*)) + cdef int i = 0 + for entity_key, (etype, attrs, specs) in sorted(patterns.items()): if isinstance(entity_key, basestring): entity_key = vocab.strings[entity_key] if isinstance(etype, basestring): etype = vocab.strings[etype] - specs = _convert_strings(specs, vocab.strings) - self.patterns[i] = init_pattern(self.mem, specs, etype) + # TODO: Do something more clever about multiple patterns for single + # entity + for spec in specs: + spec = _convert_strings(spec, vocab.strings) + self.patterns[i] = init_pattern(self.mem, spec, etype) + i += 1 self.n_patterns = len(patterns) @classmethod def from_dir(cls, vocab, data_dir): - patterns_loc = path.join(data_dir, 'ner', 'patterns.json') + patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') if path.exists(patterns_loc): - patterns = json.loads(open(patterns_loc)) + patterns_data = open(patterns_loc).read() + patterns = json.loads(patterns_data) return cls(vocab, patterns) else: return cls(vocab, {}) From 59c3bf60a6ac4566a5a562d700a85e97856143d2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:09:08 +0200 Subject: [PATCH 019/138] * Ensure entity recognizer doesn't over-write preset types --- spacy/syntax/ner.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 8fa4a03d5..c569bfa43 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -300,6 +300,8 @@ cdef class Unit: return False elif preset_ent_iob == 1: return False + elif preset_ent_iob == 3 and st.B_(0).ent_type != label: + return False elif st.B_(1).ent_iob == 1: return False return label != 0 and not st.entity_is_open() From 27679791352f2be4a275641acc03af60eb34da95 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:09:28 +0200 Subject: [PATCH 020/138] * Update matcher tests --- tests/test_matcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 62b70b520..0014e1110 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -11,9 +11,9 @@ from spacy.vocab import Vocab @pytest.fixture def matcher(EN): patterns = { - 'Javascript': ['PRODUCT', {}, [{'ORTH': 'JavaScript'}]], - 'GoogleNow': ['PRODUCT', {}, [{'ORTH': 'Google'}, {'ORTH': 'Now'}]], - 'Java': ['PRODUCT', {}, [{'ORTH': 'Java'}]], + 'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], + 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], + 'Java': ['PRODUCT', {}, [[{'ORTH': 'Java'}]]], } return Matcher(EN.vocab, patterns) From 91a94e152ba56d924d10bb4733ff2edc4a156013 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:10:04 +0200 Subject: [PATCH 021/138] * Make initial gazetteer --- lang_data/en/gazetteer.json | 65 +++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 
lang_data/en/gazetteer.json diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json new file mode 100644 index 000000000..d7e7a61a1 --- /dev/null +++ b/lang_data/en/gazetteer.json @@ -0,0 +1,65 @@ +{ + "Reddit": [ + "PRODUCT", + {}, + [ + [{"lower": "reddit"}] + ] + ], + "SeptemberElevenAttacks": [ + "EVENT", + {}, + [ + [ + {"orth": "9/11"} + ], + [ + {"lower": "Septmber"}, + {"lower": "Eleven"} + ], + [ + {"lower": "september"}, + {"orth": "11"} + ] + ] + ], + "Linux": [ + "PRODUCT", + {}, + [ + [{"lower": "linux"}] + ] + ], + "Haskell": [ + "PRODUCT", + {}, + [ + [{"lower": "haskell"}] + ] + ], + "HaskellCurry": [ + "PERSON", + {}, + [ + [ + {"lower": "haskell"}, + {"lower": "curry"} + ] + ] + ], + "Javascript": [ + "PRODUCT", + {}, + [ + [{"lower": "javascript"}] + ] + ], + "CSS": [ + "PRODUCT", + {}, + [ + [{"lower": "css"}], + [{"lower": "css3"}] + ] + ] +} \ No newline at end of file From 383dfabd673b5835a0ffd5f3c38617c089fb1c6a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:27:01 +0200 Subject: [PATCH 022/138] * Fix matcher setting of entities --- spacy/matcher.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 5aee7ee8f..7d5d4062a 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -154,5 +154,5 @@ cdef class Matcher: matches.append(get_entity(state, token, token_i)) else: partials.push_back(state + 1) - doc.ents = list(sorted(list(doc.ents) + matches)) + doc.ents = [(e.label, e.start, e.end) for e in doc.ents] + matches return matches From 8b8df851ca1885968fcd3bb18d1052990ca41e85 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:28:31 +0200 Subject: [PATCH 023/138] * Fix print statement in test_merge --- tests/spans/test_merge.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/spans/test_merge.py b/tests/spans/test_merge.py index e225db043..a47e4e53a 100644 --- a/tests/spans/test_merge.py +++ b/tests/spans/test_merge.py @@ -46,6 +46,4 @@ def test_np_merges(EN): for start, end, label, lemma in ents: merged = tokens.merge(start, end, label, lemma, label) assert merged != None, (start, end, label, lemma) - for tok in tokens: - print tok.orth_, tok.dep_, tok.head.orth_ From 10d869d10271146a393c5ebc8e2db7ceffcb4a8a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:31:53 +0200 Subject: [PATCH 024/138] * Don't allow conjunction between NPs in base NP chunks --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 6d0cd9a8b..74df27f07 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -185,7 +185,7 @@ cdef class Doc: def noun_chunks(self): """Yield spans for base noun phrases.""" cdef const TokenC* word - labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'conj', 'attr'] + labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr'] np_deps = [self.vocab.strings[label] for label in labels] np_label = self.vocab.strings['NP'] for i in range(self.length): From 5c3c9620384f36575b922499c3df7df0e53584d7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:34:51 +0200 Subject: [PATCH 025/138] * Add html to gazetteer --- lang_data/en/gazetteer.json | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index d7e7a61a1..b6e64726b 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -62,4 +62,13 @@ 
[{"lower": "css3"}] ] ] -} \ No newline at end of file + "HTML": [ + "PRODUCT", + {}, + [ + [{"lower": "html"}], + [{"lower": "html5"}] + ] + ] + +} From 832896ea6c3958766b2e4170307bb0d3cc1cb374 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 16:36:54 +0200 Subject: [PATCH 026/138] * Add html to gazetteer --- lang_data/en/gazetteer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index b6e64726b..44cb0e780 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -61,7 +61,7 @@ [{"lower": "css"}], [{"lower": "css3"}] ] - ] + ], "HTML": [ "PRODUCT", {}, From 6fcc3df9895ee1d4a712327d2204a8eae294c598 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 17:11:00 +0200 Subject: [PATCH 027/138] * Expand gazetteer with some of the errors from the reddit parse --- lang_data/en/gazetteer.json | 108 +++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index 44cb0e780..2726d75af 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -70,5 +70,111 @@ [{"lower": "html5"}] ] ] - + "Python": [ + "PRODUCT", + {}, + [ + [{"orth": "Python"}] + ] + ], + "Ruby": [ + "PRODUCT", + {}, + [ + [{"orth": "Ruby"}] + ] + ], + "Digg": [ + "PRODUCT", + {}, + [ + [{"lower": "digg"}] + ] + ], + "Fox": [ + "ORG", + {}, + [ + [{"orth": "Fox"}] + ] + ], + "Google": [ + "ORG", + {}, + [ + [{"lower": "google"}] + ] + ], + "Mac": [ + "PRODUCT", + {}, + [ + [{"lower": "mac"}] + ] + ], + "Wikipedia": [ + "PRODUCT", + {}, + [ + [{"lower": "wikipedia"}] + ] + ], + "Windows": [ + "PRODUCT", + {}, + [ + [{"orth": "Windows"}] + ] + ], + "Dell": [ + "ORG", + {}, + [ + [{"lower": "dell"}] + ] + ], + "Facebook": [ + "ORG", + {}, + [ + [{"lower": "facebook"}] + ] + ], + "Blizzard": [ + "ORG", + {}, + [ + [{"orth": "Facebook"}] + ] + ], + "Ubuntu": [ + "ORG", + {}, + [ + [{"orth": "Ubuntu"}] + ] + ], + "Youtube": [ + "PRODUCT", + {}, + [ + [{"lower": "youtube"}] + ] + ], + "false_positives": [ + null, + {}, + [{"orth": "Shit"}], + [{"orth": "Weed"}], + [{"orth": "Cool"}], + [{"orth": "Btw"}], + [{"orth": "Bah"}], + [{"orth": "Bullshit"}], + [{"orth": "Lol"}], + [{"orth": "Yo"}, {"orth": "dawg"}], + [{"orth": "Yay"}], + [{"orth": "Ahh"}], + [{"orth": "Yea"}], + [{"orth": "Bah"}] + ] } From 0e098815cc81d423c7d65f406e24c38c048df35e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 17:13:27 +0200 Subject: [PATCH 028/138] * Expand gazetteer with some of the errors from the reddit parse --- lang_data/en/gazetteer.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index 2726d75af..e91b1bcf1 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -171,7 +171,7 @@ [{"orth": "Bah"}], [{"orth": "Bullshit"}], [{"orth": "Lol"}], - [{"orth": "Yo"}, {"orth": "dawg"}], + [{"orth": "Yo"}, {"lower": "dawg"}], [{"orth": "Yay"}], [{"orth": "Ahh"}], [{"orth": "Yea"}], From 855af087fc4ce19421a3345db0964fdc89388c2f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 17:27:51 +0200 Subject: [PATCH 029/138] * Fix gazetteer.json --- lang_data/en/gazetteer.json | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index e91b1bcf1..641b3966d 100644 --- 
a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -69,7 +69,7 @@ [{"lower": "html"}], [{"lower": "html5"}] ] - ] + ], "Python": [ "PRODUCT", {}, @@ -164,17 +164,19 @@ "false_positives": [ null, {}, - [{"orth": "Shit"}], - [{"orth": "Weed"}], - [{"orth": "Cool"}], - [{"orth": "Btw"}], - [{"orth": "Bah"}], - [{"orth": "Bullshit"}], - [{"orth": "Lol"}], - [{"orth": "Yo"}, {"lower": "dawg"}], - [{"orth": "Yay"}], - [{"orth": "Ahh"}], - [{"orth": "Yea"}], - [{"orth": "Bah"}] + [ + [{"orth": "Shit"}], + [{"orth": "Weed"}], + [{"orth": "Cool"}], + [{"orth": "Btw"}], + [{"orth": "Bah"}], + [{"orth": "Bullshit"}], + [{"orth": "Lol"}], + [{"orth": "Yo"}, {"lower": "dawg"}], + [{"orth": "Yay"}], + [{"orth": "Ahh"}], + [{"orth": "Yea"}], + [{"orth": "Bah"}] + ] ] } From 9f658799911986adc150ae53a1bdcbee7e2137f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 17:28:14 +0200 Subject: [PATCH 030/138] * Fix shape attr bug, and fix handling of false positive matches --- spacy/matcher.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 7d5d4062a..ee2ceaecc 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -86,7 +86,7 @@ def map_attr_name(attr): return LEMMA elif attr == 'LOWER': return LOWER - elif attr == 'SHAOE': + elif attr == 'SHAPE': return SHAPE elif attr == 'NORM': return NORM @@ -109,6 +109,8 @@ cdef class Matcher: entity_key = vocab.strings[entity_key] if isinstance(etype, basestring): etype = vocab.strings[etype] + elif etype is None: + etype = -1 # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: From b0f5c39084266248e21e9c0b2475e009628bcbd7 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 6 Aug 2015 17:28:43 +0200 Subject: [PATCH 031/138] * Fix handling of exclusion entities --- spacy/tokens/doc.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 74df27f07..7994c97c3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -168,7 +168,7 @@ cdef class Doc: cdef attr_t ent_type cdef int start, end for ent_type, start, end in ents: - if ent_type is None: + if ent_type is None or ent_type < 0: # Mark as O for i in range(start, end): self.data[i].ent_type = 0 From 63f86efa8bcea3cb48ef67c19aef2ba879ee02da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 19:14:01 +0200 Subject: [PATCH 032/138] * Add test for specifying initial actions --- tests/parser/test_initial_actions_parse.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 tests/parser/test_initial_actions_parse.py diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py new file mode 100644 index 000000000..2f45a6994 --- /dev/null +++ b/tests/parser/test_initial_actions_parse.py @@ -0,0 +1,8 @@ +import pytest + + +def test_initial(EN): + doc = EN.tokenizer(u'I ate the pizza with anchovies.') + EN.tagger(doc) + EN.parser(doc, initial_actions=['L-nsubj', 'S', 'L-det']) + assert doc[0].head.i == 1 From 67979a800882cc576d1b273c6a805e03aa3cb188 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 19:14:32 +0200 Subject: [PATCH 033/138] * Work on reorganization of docs --- docs/source/reference/index.rst | 3 +- docs/source/reference/loading.rst | 25 +-- docs/source/reference/lookup.rst | 88 +++++++-- docs/source/reference/processing.rst | 98 ++++++---- docs/source/reference/using/document.rst | 126 +++++++----- 
docs/source/reference/using/span.rst | 62 ++++-- docs/source/reference/using/token.rst | 236 +++++++++++++++-------- 7 files changed, 411 insertions(+), 227 deletions(-) diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst index 0ee19986e..0d0e9cdf2 100644 --- a/docs/source/reference/index.rst +++ b/docs/source/reference/index.rst @@ -54,11 +54,12 @@ and a small usage snippet. .. toctree:: :maxdepth: 4 - loading.rst processing.rst using/document.rst using/span.rst using/token.rst + using/lexeme.rst + lookup.rst .. _English: processing.html diff --git a/docs/source/reference/loading.rst b/docs/source/reference/loading.rst index 83f3aaf5f..15a8d7427 100644 --- a/docs/source/reference/loading.rst +++ b/docs/source/reference/loading.rst @@ -1,27 +1,6 @@ ================= Loading Resources ================= - -99\% of the time, you will load spaCy's resources using a language pipeline class, -e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a -specified directory. By default, spaCy installs data into each language's -package directory, and loads it from there. - -Usually, this is all you will need: - - >>> from spacy.en import English - >>> nlp = English() - -If you need to replace some of the components, you may want to just make your -own pipeline class --- the English class itself does almost no work; it just -applies the modules in order. You can also provide a function or class that -produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`, -to customize the pipeline: - - >>> from spacy.en import English - >>> from my_module import MyTagger - >>> nlp = English(Tagger=MyTagger) - In more detail: .. code:: @@ -44,12 +23,12 @@ In more detail: :code:`Tokenizer` :code:`(Vocab vocab, unicode data_dir)(unicode) --> Doc` - + A class/function that creates the tokenizer. :code:`Tagger` / :code:`Parser` / :code:`Entity` :code:`(Vocab vocab, unicode data_dir)(Doc) --> None` - + A class/function that creates the part-of-speech tagger / syntactic dependency parser / named entity recogniser. May be None or False, to disable tagging. diff --git a/docs/source/reference/lookup.rst b/docs/source/reference/lookup.rst index 0b6b9bb89..340cdb6c1 100644 --- a/docs/source/reference/lookup.rst +++ b/docs/source/reference/lookup.rst @@ -17,33 +17,95 @@ up in the vocabulary directly: .. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None) - .. py:method:: __len__(self) --> int + .. py:method:: __len__(self) - .. py:method:: __getitem__(self, id: int) --> unicode + :returns: number of words in the vocabulary + :rtype: int - .. py:method:: __getitem__(self, string: unicode) --> int + .. py:method:: __getitem__(self, key_int) - .. py:method:: __setitem__(self, py_str: unicode, props: Dict[str, int[float]) --> None + :param int key: + Integer ID - .. py:method:: dump(self, loc: unicode) --> None + :returns: A Lexeme object - .. py:method:: load_lexemes(self, loc: unicode) --> None + .. py:method:: __getitem__(self, key_str) - .. py:method:: load_vectors(self, loc: unicode) --> None + :param unicode key_str: + A string in the vocabulary + + :rtype: Lexeme + + + .. py:method:: __setitem__(self, orth_str, props) + + :param unicode orth_str: + The orth key + + :param dict props: + A props dictionary + + :returns: None + + .. py:method:: dump(self, loc) + + :param unicode loc: + Path where the vocabulary should be saved + + .. 
py:method:: load_lexemes(self, loc) + + :param unicode loc: + Path to load the lexemes.bin file from + + .. py:method:: load_vectors(self, loc) + + :param unicode loc: + Path to load the vectors.bin from .. py:class:: strings.StringStore(self) - .. py:method:: __len__(self) --> int + .. py:method:: __len__(self) - .. py:method:: __getitem__(self, id: int) --> unicode + :returns: + Number of strings in the string-store - .. py:method:: __getitem__(self, string: bytes) --> id + .. py:method:: __getitem__(self, key_int) - .. py:method:: __getitem__(self, string: unicode) --> id + :param int key_int: An integer key - .. py:method:: dump(self, loc: unicode) --> None + :returns: + The string that the integer key maps to - .. py:method:: load(self, loc: unicode) --> None + :rtype: unicode + .. py:method:: __getitem__(self, key_unicode) + :param int key_unicode: + A key, as a unicode string + + :returns: + The integer ID of the string. + + :rtype: int + + .. py:method:: __getitem__(self, key_utf8_bytes) + + :param int key_utf8_bytes: + A key, as a UTF-8 encoded byte-string + + :returns: + The integer ID of the string. + + :rtype: + int + + .. py:method:: dump(self, loc) + + :param loc: + File path to save the strings.txt to. + + .. py:method:: load(self, loc) + + :param loc: + File path to load the strings.txt from. diff --git a/docs/source/reference/processing.rst b/docs/source/reference/processing.rst index 5b53c26d6..ec8e8ebca 100644 --- a/docs/source/reference/processing.rst +++ b/docs/source/reference/processing.rst @@ -1,33 +1,76 @@ -=============== -Processing Text -=============== +================ +spacy.en.English +================ + + +99\% of the time, you will load spaCy's resources using a language pipeline class, +e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a +specified directory. By default, spaCy installs data into each language's +package directory, and loads it from there. + +Usually, this is all you will need: + + >>> from spacy.en import English + >>> nlp = English() + +If you need to replace some of the components, you may want to just make your +own pipeline class --- the English class itself does almost no work; it just +applies the modules in order. You can also provide a function or class that +produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`, +to customize the pipeline: + + >>> from spacy.en import English + >>> from my_module import MyTagger + >>> nlp = English(Tagger=MyTagger) The text processing API is very small and simple. Everything is a callable object, and you will almost always apply the pipeline all at once. -Applying a pipeline -------------------- +.. py:class:: spacy.en.English + + .. py:method:: __init__(self, data_dir=..., Tokenizer=..., Tagger=..., Parser=..., Entity=..., Matcher=..., Packer=None, load_vectors=True) -.. py:method:: English.__call__(text, tag=True, parse=True, entity=True) --> Doc + :param unicode data_dir: + The data directory. May be None, to disable any data loading (including + the vocabulary). + :param Tokenizer: + A class/function that creates the tokenizer. -text (unicode) - The text to be processed. No pre-processing needs to be applied, and any - length of text can be submitted. Usually you will submit a whole document. - Text may be zero-length. An exception is raised if byte strings are supplied. + :param Tagger: + A class/function that creates the part-of-speech tagger. -tag (bool) - Whether to apply the part-of-speech tagger. 
Required for parsing and entity recognition. + :param Parser: + A class/function that creates the dependency parser. -parse (bool) - Whether to apply the syntactic dependency parser. + :param Entity: + A class/function that creates the named entity recogniser. -entity (bool) - Whether to apply the named entity recognizer. + :param bool load_vectors: + A boolean value to control whether the word vectors are loaded. + .. py:method:: __call__(text, tag=True, parse=True, entity=True) --> Doc -**Examples** + :param unicode text: + The text to be processed. No pre-processing needs to be applied, and any + length of text can be submitted. Usually you will submit a whole document. + Text may be zero-length. An exception is raised if byte strings are supplied. + + :param bool tag: + Whether to apply the part-of-speech tagger. Required for parsing and entity + recognition. + + :param bool parse: + Whether to apply the syntactic dependency parser. + + :param bool entity: + Whether to apply the named entity recognizer. + + :return: A document + :rtype: :py:class:`spacy.tokens.Doc` + + :Example: >>> from spacy.en import English >>> nlp = English() @@ -44,24 +87,3 @@ entity (bool) TypeError: Argument 'string' has incorrect type (expected unicode, got str) >>> doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. >>> - - -Tokenizer ---------- - - -.. autoclass:: spacy.tokenizer.Tokenizer - :members: - - -Tagger ------- - -.. autoclass:: spacy.en.pos.EnPosTagger - :members: - -Parser and Entity Recognizer ----------------------------- - -.. autoclass:: spacy.syntax.parser.Parser - :members: diff --git a/docs/source/reference/using/document.rst b/docs/source/reference/using/document.rst index e920fba9e..7507f7f21 100644 --- a/docs/source/reference/using/document.rst +++ b/docs/source/reference/using/document.rst @@ -2,69 +2,93 @@ The Doc Object ============== -.. autoclass:: spacy.tokens.Tokens -:code:`__getitem__`, :code:`__iter__`, :code:`__len__` - The Tokens class behaves as a Python sequence, supporting the usual operators, - len(), etc. Negative indexing is supported. Slices are not yet. +.. py:class:: spacy.tokens.doc.Doc - .. code:: + .. py:method:: __init__(self, Vocab vocab, orths_and_spaces=None) - >>> tokens = nlp(u'Zero one two three four five six') - >>> tokens[0].orth_ - u'Zero' - >>> tokens[-1].orth_ - u'six' - >>> tokens[0:4] - Error + :param Vocab vocab: A vocabulary object. -:code:`sents` - Iterate over sentences in the document. + :param list orths_and_spaces=None: Defaults to None. -:code:`ents` - Iterate over entities in the document. + .. py:method:: __getitem__(self, int i) + + :returns: Token -:code:`to_array` - Given a list of M attribute IDs, export the tokens to a numpy ndarray - of shape N*M, where N is the length of the sentence. + .. py:method:: __getitem__(self, slice start_colon_end) - Arguments: - attr_ids (list[int]): A list of attribute ID ints. + :returns: Span - Returns: - feat_array (numpy.ndarray[long, ndim=2]): - A feature matrix, with one row per word, and one column per attribute - indicated in the input attr_ids. - -:code:`count_by` - Produce a dict of {attribute (int): count (ints)} frequencies, keyed - by the values of the given attribute ID. + .. 
py:method:: __iter__(self) - >>> from spacy.en import English, attrs - >>> nlp = English() - >>> tokens = nlp(u'apple apple orange banana') - >>> tokens.count_by(attrs.ORTH) - {12800L: 1, 11880L: 2, 7561L: 1} - >>> tokens.to_array([attrs.ORTH]) - array([[11880], - [11880], - [ 7561], - [12800]]) + Iterate over tokens + + .. code:: -:code:`merge` - Merge a multi-word expression into a single token. Currently - experimental; API is likely to change. + >>> tokens = nlp(u'Zero one two three four five six') + >>> tokens[0].orth_ + u'Zero' + >>> tokens[-1].orth_ + u'six' + .. py:method:: __len__(self) + Number of tokens -Internals - A Tokens instance stores the annotations in a C-array of `TokenC` structs. - Each TokenC struct holds a const pointer to a LexemeC struct, which describes - a vocabulary item. + .. py:attribute:: sents + + Iterate over sentences in the document. - The Token objects are built lazily, from this underlying C-data. + :returns generator: Sentences - For faster access, the underlying C data can be accessed from Cython. You - can also export the data to a numpy array, via `Tokens.to_array`, if pure Python - access is required, and you need slightly better performance. However, this - is both slower and has a worse API than Cython access. + .. py:attribute:: ents + + Iterate over named entities in the document. + + :returns tuple: Named Entities + + .. py:attribute:: noun_chunks + + :returns generator: + + .. py:method:: to_array(self, list attr_ids) + + Given a list of M attribute IDs, export the tokens to a numpy ndarray + of shape N*M, where N is the length of the sentence. + + :param list[int] attr_ids: A list of attribute ID ints. + + :returns feat_array: + A feature matrix, with one row per word, and one column per attribute + indicated in the input attr_ids. + + .. py:method:: count_by(self, attr_id) + + Produce a dict of {attribute (int): count (ints)} frequencies, keyed + by the values of the given attribute ID. + + .. code:: + + >>> from spacy.en import English, attrs + >>> nlp = English() + >>> tokens = nlp(u'apple apple orange banana') + >>> tokens.count_by(attrs.ORTH) + {12800L: 1, 11880L: 2, 7561L: 1} + >>> tokens.to_array([attrs.ORTH]) + array([[11880], + [11880], + [ 7561], + [12800]]) + + .. py:method:: from_array(self, attrs, array) + + .. py:method:: to_bytes(self) + + .. py:method:: from_bytes(self) + + .. py:method:: read_bytes(self) + + .. py:method:: merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type) + + Merge a multi-word expression into a single token. Currently + experimental; API is likely to change. diff --git a/docs/source/reference/using/span.rst b/docs/source/reference/using/span.rst index 3aa19c469..c3c78a68f 100644 --- a/docs/source/reference/using/span.rst +++ b/docs/source/reference/using/span.rst @@ -4,29 +4,55 @@ The Span Object .. autoclass:: spacy.spans.Span -:code:`__getitem__`, :code:`__iter__`, :code:`__len__` - Sequence API +.. py:class:: Span -:code:`head` - Syntactic head, or None -:code:`left` - Tokens to the left of the span + .. py:method:: __getitem__ -:code:`rights` - Tokens to the left of the span + .. py:method:: __iter__ -:code:`orth` / :code:`orth_` - Orth string + .. py:method:: __len__ -:code:`lemma` / :code:`lemma_` - Lemma string + .. py:attribute:: root -:code:`string` - String + Syntactic head -:code:`label` / :code:`label_` - Label + .. py:attribute:: lefts -:code:`subtree` - Lefts + [self] + Rights + Tokens that are: + + 1. To the left of the span; + 2. 
Syntactic children of words within the span + + i.e. + + .. code:: + + lefts = [span.doc[i] for i in range(0, span.start) if span.doc[i].head in span] + + .. py:attribute:: rights + + Tokens that are: + + 1. To the right of the span; + 2. Syntactic children of words within the span + + i.e. + + .. code:: + + rights = [span.doc[i] for i in range(span.end, len(span.doc)) if span.doc[i].head in span] + + Tokens that are: + + 1. To the right of the span; + 2. Syntactic children of words within the span + + + .. py:attribute:: string + + .. py:attribute:: lemma / lemma\_ + + .. py:attribute:: label / label\_ + + .. py:attribute:: subtree diff --git a/docs/source/reference/using/token.rst b/docs/source/reference/using/token.rst index f32830c84..721e78820 100644 --- a/docs/source/reference/using/token.rst +++ b/docs/source/reference/using/token.rst @@ -11,115 +11,185 @@ token.orth is an integer ID, token.orth\_ is the unicode value. The only exception is the Token.string attribute, which is (unicode) string-typed. -**String Features** -:code:`orth` / :code:`orth_` - The form of the word with no string normalization or processing, as it - appears in the string, without trailing whitespace. +.. py:class:: Token -:code:`lemma` / :code:`lemma_` - The "base" of the word, with no inflectional suffixes, e.g. the lemma of - "developing" is "develop", the lemma of "geese" is "goose", etc. Note that - *derivational* suffixes are not stripped, e.g. the lemma of "instutitions" - is "institution", not "institute". Lemmatization is performed using the - WordNet data, but extended to also cover closed-class words such as - pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". - We assign pronouns the lemma -PRON-. + .. py:method:: __init__(self, Vocab vocab, Doc doc, int offset) -:code:`lower` / :code:`lower_` - The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower() + **String Views** -:code:`norm` / :code:`norm_` - The form of the word, after language-specific normalizations have been - applied. + .. py:attribute:: orth / orth\_ -:code:`shape` / :code:`shape_` - A transform of the word's string, to show orthographic features. The - characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. - After these mappings, sequences of 4 or more of the same character are - truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, - :) --> :) + The form of the word with no string normalization or processing, as it + appears in the string, without trailing whitespace. -:code:`prefix` / :code:`prefix_` - A length-N substring from the start of the word. Length may vary by - language; currently for English n=1, i.e. prefix = word.orth\_[:1] + .. py:attribute:: lemma / lemma\_ -:code:`suffix` / :code:`suffix_` - A length-N substring from the end of the word. Length may vary by - language; currently for English n=3, i.e. suffix = word.orth\_[-3:] + The "base" of the word, with no inflectional suffixes, e.g. the lemma of + "developing" is "develop", the lemma of "geese" is "goose", etc. Note that + *derivational* suffixes are not stripped, e.g. the lemma of "instutitions" + is "institution", not "institute". Lemmatization is performed using the + WordNet data, but extended to also cover closed-class words such as + pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". + We assign pronouns the lemma -PRON-. -:code:`string` - The form of the word as it appears in the string, **including trailing - whitespace**. 
This is useful when you need to use linguistic features to - add inline mark-up to the string. + .. py:attribute:: lower / lower\_ + The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower() -**Distributional Features** + .. py:attribute:: norm / norm\_ -:code:`prob` - The unigram log-probability of the word, estimated from counts from a - large corpus, smoothed using Simple Good Turing estimation. + The form of the word, after language-specific normalizations have been + applied. -:code:`cluster` - The Brown cluster ID of the word. These are often useful features for - linear models. If you're using a non-linear model, particularly - a neural net or random forest, consider using the real-valued word - representation vector, in Token.repvec, instead. + .. py:attribute:: shape / shape\_ -:code:`repvec` - A "word embedding" representation: a dense real-valued vector that supports - similarity queries between words. By default, spaCy currently loads - vectors produced by the Levy and Goldberg (2014) dependency-based word2vec - model. + A transform of the word's string, to show orthographic features. The + characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. + After these mappings, sequences of 4 or more of the same character are + truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, + :) --> :) -**Syntactic Features** + .. py:attribute:: prefix / prefix\_ -:code:`tag` - A morphosyntactic tag, e.g. NN, VBZ, DT, etc. These tags are - language/corpus specific, and typically describe part-of-speech and some - amount of morphological information. For instance, in the Penn Treebank - tag set, VBZ is assigned to a present-tense singular verb. + A length-N substring from the start of the word. Length may vary by + language; currently for English n=1, i.e. prefix = word.orth\_[:1] -:code:`pos` - A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, - ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech. + .. py:attribute:: suffix / suffix\_ -:code:`dep` - The type of syntactic dependency relation between the word and its - syntactic head. + A length-N substring from the end of the word. Length may vary by + language; currently for English n=3, i.e. suffix = word.orth\_[-3:] -:code:`n_lefts` - The number of immediate syntactic children preceding the word in the - string. + .. py:attribute:: lex_id -:code:`n_rights` - The number of immediate syntactic children following the word in the - string. + **Alignment and Output** -**Navigating the Dependency Tree** + .. py:attribute:: idx -:code:`head` - The Token that is the immediate syntactic head of the word. If the word is - the root of the dependency tree, the same word is returned. + .. py:method:: __len__(self) -:code:`lefts` - An iterator for the immediate leftward syntactic children of the word. + .. py:method:: __unicode__(self) -:code:`rights` - An iterator for the immediate rightward syntactic children of the word. + .. py:method:: __str__(self) -:code:`children` - An iterator that yields from lefts, and then yields from rights. + .. py:attribute:: string -:code:`subtree` - An iterator for the part of the sentence syntactically governed by the - word, including the word itself. + The form of the word as it appears in the string, **including trailing + whitespace**. This is useful when you need to use linguistic features to + add inline mark-up to the string. + .. 
py:method:: nbor(self, int i=1) -**Named Entities** + **Distributional Features** -:code:`ent_type` - If the token is part of an entity, its entity type + .. py:attribute:: repvec -:code:`ent_iob` - The IOB (inside, outside, begin) entity recognition tag for the token + A "word embedding" representation: a dense real-valued vector that supports + similarity queries between words. By default, spaCy currently loads + vectors produced by the Levy and Goldberg (2014) dependency-based word2vec + model. + + .. py:attribute:: cluster + + The Brown cluster ID of the word. These are often useful features for + linear models. If you're using a non-linear model, particularly + a neural net or random forest, consider using the real-valued word + representation vector, in Token.repvec, instead. + + .. py:attribute:: prob + + The unigram log-probability of the word, estimated from counts from a + large corpus, smoothed using Simple Good Turing estimation. + + **Navigating the Dependency Tree** + + .. py:attribute:: pos / pos\_ + + A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, + ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech. + + .. py:attribute:: tag / tag\_ + + A morphosyntactic tag, e.g. NN, VBZ, DT, etc. These tags are + language/corpus specific, and typically describe part-of-speech and some + amount of morphological information. For instance, in the Penn Treebank + tag set, VBZ is assigned to a present-tense singular verb. + + .. py:attribute:: dep / dep\_ + + The type of syntactic dependency relation between the word and its + syntactic head. + + .. py:attribute:: head + + The Token that is the immediate syntactic head of the word. If the word is + the root of the dependency tree, the same word is returned. + + .. py:attribute:: lefts + + An iterator for the immediate leftward syntactic children of the word. + + .. py:attribute:: rights + + An iterator for the immediate rightward syntactic children of the word. + + .. py:attribute:: n_lefts + + The number of immediate syntactic children preceding the word in the + string. + + .. py:attribute:: n_rights + + The number of immediate syntactic children following the word in the + string. + + .. py:attribute:: children + + An iterator that yields from lefts, and then yields from rights. + + .. py:attribute:: subtree + + An iterator for the part of the sentence syntactically governed by the + word, including the word itself. + + .. py:attribute:: left_edge + + .. py:attribute:: right_edge + + .. py:attribute:: conjuncts + + **Named Entities** + + .. py:attribute:: ent_type + + If the token is part of an entity, its entity type + + .. py:attribute:: ent_iob + + The IOB (inside, outside, begin) entity recognition tag for the token + + **Lexeme Flags** + + .. py:method:: check_flag(self, attr_id_t flag_id) + + .. py:attribute:: is_oov + + .. py:attribute:: is_alpha + + .. py:attribute:: is_ascii + + .. py:attribute:: is_digit + + .. py:attribute:: is_lower + + .. py:attribute:: is_title + + .. py:attribute:: is_punct + + .. py:attribute:: is_space + + .. py:attribute:: like_url + + .. py:attribute:: like_num + + .. 
py:attribute:: like_email From b27bd18d6efcf96b2567c041663022fc62441e4f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:30:49 +0200 Subject: [PATCH 034/138] * Add spaCy to gazetteer --- lang_data/en/gazetteer.json | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index 641b3966d..1aa6b9514 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -62,6 +62,21 @@ [{"lower": "css3"}] ] ], + "displaCy": [ + "PRODUCT", + {}, + [ + [{"lower": "displacy"}] + ] + ], + "spaCy": [ + "PRODUCT", + {}, + [ + [{"orth": "spaCy"}] + ] + ], + "HTML": [ "PRODUCT", {}, @@ -91,11 +106,12 @@ [{"lower": "digg"}] ] ], - "Fox": [ + "FoxNews": [ "ORG", {}, [ - [{"orth": "Fox"}] + [{"orth": "Fox"}], + [{"orth": "News"}] ] ], "Google": [ From 76a1f0481a49c1826ddaeb9c008e5d7e5d8fd766 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:31:54 +0200 Subject: [PATCH 035/138] * Whitespace --- spacy/syntax/arc_eager.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index b297140ba..441c3df22 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -267,7 +267,7 @@ cdef class Break: return cost else: return cost + 1 - + @staticmethod cdef inline int label_cost(StateClass s, const GoldParseC* gold, int label) nogil: return 0 @@ -279,7 +279,7 @@ cdef int _get_root(int word, const GoldParseC* gold) nogil: return -1 else: return word - + cdef class ArcEager(TransitionSystem): @classmethod @@ -322,8 +322,9 @@ cdef class ArcEager(TransitionSystem): cdef Transition lookup_transition(self, object name) except *: if '-' in name: move_str, label_str = name.split('-', 1) - label = self.label_ids[label_str] + label = self.strings[label_str] else: + move_str = name label = 0 move = MOVE_NAMES.index(move_str) for i in range(self.n_moves): From 3af938365faae661867dad0a1991e7d9d0eb770f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:32:15 +0200 Subject: [PATCH 036/138] * Add function partial to Parser --- spacy/syntax/parser.pyx | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 69d70ad03..a45f17339 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -88,6 +88,22 @@ cdef class Parser: self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) + def partial(self, Doc tokens, initial_actions): + cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) + self.moves.initialize_state(stcls) + cdef object action_name + cdef Transition action + for action_name in initial_actions: + action = self.moves.lookup_transition(action_name) + action.do(stcls, action.label) + + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, + self.model.n_feats, self.model.n_feats) + with nogil: + self.parse(stcls, eg.c) + tokens.set_parse(stcls._sent) + return stcls + cdef void parse(self, StateClass stcls, ExampleC eg) nogil: while not stcls.is_final(): memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) @@ -109,9 +125,9 @@ cdef class Parser: cdef Transition G while not stcls.is_final(): memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) - + self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) - + fill_context(eg.c.atoms, stcls) self.model.train(eg) From 7bafc789e780164f15cc50fe11627c4503bd9151 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal 
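The `partial` method added in the commit above takes transition names in the 'MOVE-label' format understood by `ArcEager.lookup_transition` (e.g. 'L-nsubj', or a bare 'S' for shift). A minimal sketch of driving it, mirroring `tests/parser/test_initial_actions_parse.py` from this series; later commits rework this entry point into `get_state` and finally `step_through`, so treat the call as illustrative:

    >>> from spacy.en import English
    >>> nlp = English()
    >>> doc = nlp.tokenizer(u'I ate the pizza with anchovies.')
    >>> nlp.tagger(doc)
    >>> state = nlp.parser.partial(doc, ['L-nsubj', 'S', 'L-det'])  # forced prefix, then the model continues
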
Date: Sat, 8 Aug 2015 23:32:42 +0200 Subject: [PATCH 037/138] * Add stack and queue properties to stateclass, for python access --- spacy/syntax/stateclass.pyx | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index 2a7bcfd7a..d5d86594e 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -36,6 +36,14 @@ cdef class StateClass: self._buffer[i] = i self._empty_token.lex = &EMPTY_LEXEME + @property + def stack(self): + return {self.S(i) for i in range(self._s_i)} + + @property + def queue(self): + return {self.B(i) for i in range(self._b_i, self.length)} + cdef int E(self, int i) nogil: if self._e_i <= 0 or self._e_i >= self.length: return 0 From 2a46c77324f68fb753cfb343e91ea258bb0c2312 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:35:59 +0200 Subject: [PATCH 038/138] * Whitespace --- spacy/syntax/transition_system.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd index 387cd0fc9..4cf9aae7e 100644 --- a/spacy/syntax/transition_system.pxd +++ b/spacy/syntax/transition_system.pxd @@ -47,6 +47,6 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except * cdef int set_valid(self, int* output, StateClass state) nogil - + cdef int set_costs(self, int* is_valid, int* costs, StateClass state, GoldParse gold) except -1 From cc9deae9600456b2cda8c605e1487a8d97778b26 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:36:18 +0200 Subject: [PATCH 039/138] * Add is_valid method to transition_system --- spacy/syntax/transition_system.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 00395333f..86aef1fbc 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -54,6 +54,10 @@ cdef class TransitionSystem: cdef Transition init_transition(self, int clas, int move, int label) except *: raise NotImplementedError + def is_valid(self, StateClass stcls, move_name): + action = self.lookup_transition(move_name) + return action.is_valid(stcls, action.label) + cdef int set_valid(self, int* is_valid, StateClass stcls) nogil: cdef int i for i in range(self.n_moves): From 01be34d55a42743a265bd16307e49d4a9f8f84e6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:37:44 +0200 Subject: [PATCH 040/138] * Whitespace --- spacy/tokens/token.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 6aa000f05..f1f2696cb 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -193,7 +193,7 @@ cdef class Token: property left_edge: def __get__(self): return self.doc[self.c.l_edge] - + property right_edge: def __get__(self): return self.doc[self.c.r_edge] @@ -202,7 +202,7 @@ cdef class Token: def __get__(self): """The token predicted by the parser to be the head of the current token.""" return self.doc[self.i + self.c.head] - + property conjuncts: def __get__(self): """Get a list of conjoined words""" @@ -290,7 +290,7 @@ cdef class Token: property is_alpha: def __get__(self): return check_flag(self.c.lex, IS_ALPHA) - + property is_ascii: def __get__(self): return check_flag(self.c.lex, IS_ASCII) @@ -311,7 +311,7 @@ cdef class Token: property like_url: def __get__(self): return check_flag(self.c.lex, LIKE_URL) - + property like_num: def 
__get__(self): return check_flag(self.c.lex, LIKE_NUM) From c044c0e18eee07ef0a6cbb76dd3debdc0262d292 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:38:19 +0200 Subject: [PATCH 041/138] * Fix partial actions test --- tests/parser/test_initial_actions_parse.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index 2f45a6994..8c844ce13 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -4,5 +4,9 @@ import pytest def test_initial(EN): doc = EN.tokenizer(u'I ate the pizza with anchovies.') EN.tagger(doc) - EN.parser(doc, initial_actions=['L-nsubj', 'S', 'L-det']) + next_actions = EN.parser.partial(doc, actions=['L-nsubj', 'S', 'L-det']) assert doc[0].head.i == 1 + assert doc[1].head.i == 1 + assert doc[2].head.i == 2 + assert doc[3].head.i == 2 + assert doc From d4e79a5ca2ee5ffb41c32d079a777dcb80a94b82 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:41:36 +0200 Subject: [PATCH 042/138] * Fix test initial parse --- tests/parser/test_initial_actions_parse.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index 8c844ce13..7d36d2ab1 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -4,7 +4,7 @@ import pytest def test_initial(EN): doc = EN.tokenizer(u'I ate the pizza with anchovies.') EN.tagger(doc) - next_actions = EN.parser.partial(doc, actions=['L-nsubj', 'S', 'L-det']) + next_actions = EN.parser.partial(doc, ['L-nsubj', 'S', 'L-det']) assert doc[0].head.i == 1 assert doc[1].head.i == 1 assert doc[2].head.i == 2 From 9de218b7ba1f5b12d6e528d5a94567f3d732f7fc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:45:18 +0200 Subject: [PATCH 043/138] * Fix Parser.partial function --- spacy/syntax/parser.pyx | 5 ----- 1 file changed, 5 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index a45f17339..c9e1b0412 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -96,11 +96,6 @@ cdef class Parser: for action_name in initial_actions: action = self.moves.lookup_transition(action_name) action.do(stcls, action.label) - - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, - self.model.n_feats, self.model.n_feats) - with nogil: - self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) return stcls From 0ea08c43197262fdc201fd6de0f3b44f4462cc74 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 8 Aug 2015 23:45:36 +0200 Subject: [PATCH 044/138] * Fix test partial parse --- tests/parser/test_initial_actions_parse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index 7d36d2ab1..c1603cd93 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -7,6 +7,6 @@ def test_initial(EN): next_actions = EN.parser.partial(doc, ['L-nsubj', 'S', 'L-det']) assert doc[0].head.i == 1 assert doc[1].head.i == 1 - assert doc[2].head.i == 2 - assert doc[3].head.i == 2 + assert doc[2].head.i == 3 + assert doc[3].head.i == 3 assert doc From 0653288fa58c7904356c76b1c7cb5bbaa552c13d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 9 Aug 2015 00:39:02 +0200 Subject: [PATCH 045/138] * Fix stateclass.queue --- 
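With the corrected `Parser.partial` above (no model continuation after the forced transitions), only the arcs created by the supplied actions exist, and every still-unattached token points at itself. For the test sentence, 'I' and 'the' are attached leftward by 'L-nsubj' and 'L-det', while 'ate' and 'pizza' remain their own heads, which is exactly what the updated assertions check. A sketch of the convention:

    >>> [w.head.i for w in doc[:4]]      # u'I ate the pizza with anchovies.'
    [1, 1, 3, 3]
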
spacy/syntax/stateclass.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index d5d86594e..6f7951987 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -42,7 +42,7 @@ cdef class StateClass: @property def queue(self): - return {self.B(i) for i in range(self._b_i, self.length)} + return {self.B(i) for i in range(self._b_i)} cdef int E(self, int i) nogil: if self._e_i <= 0 or self._e_i >= self.length: From 18331dca8970c1cb238fb7a5a1eeb7921d5e7abc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 9 Aug 2015 01:31:54 +0200 Subject: [PATCH 046/138] * Add continue_for argument to parser 'partial' function, which is now renamed to get_state --- spacy/syntax/parser.pyx | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index c9e1b0412..80c70c333 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -88,14 +88,30 @@ cdef class Parser: self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) - def partial(self, Doc tokens, initial_actions): + def get_state(self, Doc tokens, initial_actions, continue_for=0): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) cdef object action_name cdef Transition action + cdef Example eg for action_name in initial_actions: - action = self.moves.lookup_transition(action_name) + try: + action = self.moves.lookup_transition(action_name) + except IndexError: + break action.do(stcls, action.label) + else: + eg = Example(self.model.n_classes, CONTEXT_SIZE, + self.model.n_feats, self.model.n_feats) + while not stcls.is_final() and continue_for != 0: + memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) + self.moves.set_valid(eg.c.is_valid, stcls) + fill_context(eg.c.atoms, stcls) + self.model.set_scores(eg.c.scores, eg.c.atoms) + eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.model.n_classes) + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.c.guess].label) + if stcls.is_final(): + self.moves.finalize_state(stcls) tokens.set_parse(stcls._sent) return stcls From f0f4fa98385a372e67033e1d74be0d188e8c9d55 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 9 Aug 2015 01:40:13 +0200 Subject: [PATCH 047/138] * Fix Parser.get_state --- spacy/syntax/parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 80c70c333..276e40f94 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -109,7 +109,7 @@ cdef class Parser: fill_context(eg.c.atoms, stcls) self.model.set_scores(eg.c.scores, eg.c.atoms) eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.model.n_classes) - self.moves.c[eg.guess].do(stcls, self.moves.c[eg.c.guess].label) + self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) if stcls.is_final(): self.moves.finalize_state(stcls) tokens.set_parse(stcls._sent) From 55fde0e240ab282e01dd7b1e6b1a49f445205a65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 9 Aug 2015 01:45:30 +0200 Subject: [PATCH 048/138] * Fix get_state --- spacy/syntax/parser.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 276e40f94..45da6402d 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -108,8 +108,8 @@ cdef class Parser: self.moves.set_valid(eg.c.is_valid, stcls) fill_context(eg.c.atoms, stcls) 
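                # Greedy decoding: score the features for the current state, then
                # apply the highest-scoring transition that is still valid.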
self.model.set_scores(eg.c.scores, eg.c.atoms) - eg.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.model.n_classes) - self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) + eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.model.n_classes) + self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label) if stcls.is_final(): self.moves.finalize_state(stcls) tokens.set_parse(stcls._sent) From 04fccfb9848ba2e1f9b5d707df3e04bde90cb8b4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 9 Aug 2015 02:11:22 +0200 Subject: [PATCH 049/138] * Fix get_state for parser prediction --- spacy/syntax/parser.pyx | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 45da6402d..c215bd30b 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -88,28 +88,24 @@ cdef class Parser: self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) - def get_state(self, Doc tokens, initial_actions, continue_for=0): + def get_state(self, Doc tokens, initial_actions): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) self.moves.initialize_state(stcls) cdef object action_name cdef Transition action - cdef Example eg + cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, + self.model.n_feats, self.model.n_feats) for action_name in initial_actions: - try: - action = self.moves.lookup_transition(action_name) - except IndexError: - break - action.do(stcls, action.label) - else: - eg = Example(self.model.n_classes, CONTEXT_SIZE, - self.model.n_feats, self.model.n_feats) - while not stcls.is_final() and continue_for != 0: + if action_name == '_': memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) self.moves.set_valid(eg.c.is_valid, stcls) fill_context(eg.c.atoms, stcls) self.model.set_scores(eg.c.scores, eg.c.atoms) eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.model.n_classes) - self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label) + action = self.moves.c[eg.c.guess] + else: + action = self.moves.lookup_transition(action_name) + action.do(stcls, action.label) if stcls.is_final(): self.moves.finalize_state(stcls) tokens.set_parse(stcls._sent) From 9c090945e0586a56e8f7b448837051f8bf48097b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 9 Aug 2015 02:29:58 +0200 Subject: [PATCH 050/138] * Add Parser.predict method, and clean up Parser.get_state --- spacy/syntax/parser.pxd | 1 + spacy/syntax/parser.pyx | 23 +++++++++-------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 411172eda..4ee30341a 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -16,3 +16,4 @@ cdef class Parser: cdef readonly TransitionSystem moves cdef void parse(self, StateClass stcls, ExampleC eg) nogil + cdef void predict(self, StateClass stcls, ExampleC* eg) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index c215bd30b..bb7e2d96a 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -97,11 +97,7 @@ cdef class Parser: self.model.n_feats, self.model.n_feats) for action_name in initial_actions: if action_name == '_': - memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) - self.moves.set_valid(eg.c.is_valid, stcls) - fill_context(eg.c.atoms, stcls) - self.model.set_scores(eg.c.scores, eg.c.atoms) - eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.model.n_classes) + self.predict(stcls, &eg.c) action = 
self.moves.c[eg.c.guess] else: action = self.moves.lookup_transition(action_name) @@ -111,13 +107,16 @@ cdef class Parser: tokens.set_parse(stcls._sent) return stcls + cdef void predict(self, StateClass stcls, ExampleC* eg) nogil: + memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) + self.moves.set_valid(eg.is_valid, stcls) + fill_context(eg.atoms, stcls) + self.model.set_scores(eg.scores, eg.atoms) + eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes) + cdef void parse(self, StateClass stcls, ExampleC eg) nogil: while not stcls.is_final(): - memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) - self.moves.set_valid(eg.is_valid, stcls) - fill_context(eg.atoms, stcls) - self.model.set_scores(eg.scores, eg.atoms) - eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes) + self.predict(stcls, &eg) self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) self.moves.finalize_state(stcls) @@ -132,13 +131,9 @@ cdef class Parser: cdef Transition G while not stcls.is_final(): memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t)) - self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold) - fill_context(eg.c.atoms, stcls) - self.model.train(eg) - G = self.moves.c[eg.c.guess] self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label) From fe43f8cf39b6392f6c2d58c2979a6100bf12869a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 9 Aug 2015 02:31:53 +0200 Subject: [PATCH 051/138] * Whitespace --- spacy/syntax/ner.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index c569bfa43..8414456b6 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -47,6 +47,7 @@ cdef bint _entity_is_sunk(StateClass st, Transition* golds) nogil: else: return False + cdef class BiluoPushDown(TransitionSystem): @classmethod def get_labels(cls, gold_tuples): From 9de98f5a6fc5d5c012c00fa9278dca21a6beb489 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 10 Aug 2015 00:08:46 +0200 Subject: [PATCH 052/138] * Add Parser.stepthrough method, with context manager --- spacy/syntax/parser.pyx | 80 +++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 19 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index bb7e2d96a..d53a1959a 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -88,25 +88,6 @@ cdef class Parser: self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) - def get_state(self, Doc tokens, initial_actions): - cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) - self.moves.initialize_state(stcls) - cdef object action_name - cdef Transition action - cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, - self.model.n_feats, self.model.n_feats) - for action_name in initial_actions: - if action_name == '_': - self.predict(stcls, &eg.c) - action = self.moves.c[eg.c.guess] - else: - action = self.moves.lookup_transition(action_name) - action.do(stcls, action.label) - if stcls.is_final(): - self.moves.finalize_state(stcls) - tokens.set_parse(stcls._sent) - return stcls - cdef void predict(self, StateClass stcls, ExampleC* eg) nogil: memset(eg.scores, 0, eg.nr_class * sizeof(weight_t)) self.moves.set_valid(eg.is_valid, stcls) @@ -139,3 +120,64 @@ cdef class Parser: self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label) loss += eg.c.loss return loss + + def step_through(self, Doc doc): + return StepwiseState(self, doc) + + +cdef class StepwiseState: + cdef readonly StateClass stcls + cdef readonly 
Example eg + cdef readonly Doc doc + cdef readonly Parser parser + + def __init__(self, Parser parser, Doc doc): + self.parser = parser + self.doc = doc + self.stcls = StateClass.init(doc.data, doc.length) + self.parser.moves.initialize_state(self.stcls) + self.eg = Example(self.parser.model.n_classes, CONTEXT_SIZE, + self.parser.model.n_feats, self.parser.model.n_feats) + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.finish() + + @property + def is_final(self): + return self.stcls.is_final() + + @property + def stack(self): + return self.stcls.stack + + @property + def queue(self): + return self.stcls.queue + + @property + def heads(self): + return [self.stcls.H(i) for i in range(self.stcls.length)] + + @property + def deps(self): + return [self.doc.vocab.strings[self.stcls._sent[i].dep] + for i in range(self.stcls.length)] + + def predict(self): + self.parser.predict(self.stcls, &self.eg.c) + action = self.parser.moves.c[self.eg.c.guess] + return self.parser.moves.move_name(action.move, action.label) + + def transition(self, action_name): + if action_name == '_': + action_name = self.predict() + action = self.parser.moves.lookup_transition(action_name) + action.do(self.stcls, action.label) + + def finish(self): + if self.stcls.is_final(): + self.parser.moves.finalize_state(self.stcls) + self.doc.set_parse(self.stcls._sent) From 2c9753eff2b64a060a0cdd8a0c26f3873c4ad033 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 10 Aug 2015 00:09:02 +0200 Subject: [PATCH 053/138] * Whitespace --- spacy/syntax/arc_eager.pyx | 2 +- spacy/syntax/stateclass.pxd | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 441c3df22..265018920 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -109,7 +109,7 @@ cdef bint label_is_gold(const GoldParseC* gold, int head, int child, int label) cdef bint _is_gold_root(const GoldParseC* gold, int word) nogil: return gold.labels[word] == -1 or gold.heads[word] == word - + cdef class Shift: @staticmethod diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd index 888b01c32..8a10f5a39 100644 --- a/spacy/syntax/stateclass.pxd +++ b/spacy/syntax/stateclass.pxd @@ -71,7 +71,6 @@ cdef class StateClass: return -1 return self._sent[i].head + i - cdef int E(self, int i) nogil cdef int R(self, int i, int idx) nogil From 6116413b47277aed5c22df7539497bcfdaee67e1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 10 Aug 2015 05:05:31 +0200 Subject: [PATCH 054/138] * Fix label prediction in StepwiseState --- spacy/syntax/parser.pyx | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index d53a1959a..ed89d4cc8 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -172,12 +172,32 @@ cdef class StepwiseState: return self.parser.moves.move_name(action.move, action.label) def transition(self, action_name): + moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} if action_name == '_': action_name = self.predict() - action = self.parser.moves.lookup_transition(action_name) + if action_name == 'L' or action_name == 'R': + self.predict() + move = moves[action_name] + clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, + self.eg.c.nr_class) + action = self.parser.moves.c[clas] + else: + action = self.parser.moves.lookup_transition(action_name) action.do(self.stcls, action.label) def finish(self): if 
self.stcls.is_final(): self.parser.moves.finalize_state(self.stcls) self.doc.set_parse(self.stcls._sent) + + +cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actions, + int nr_class) except -1: + cdef weight_t score = 0 + cdef int mode = -1 + cdef int i + for i in range(nr_class): + if actions[i].move == move and (mode == -1 or scores[i] >= score): + mode = actions[i].clas + score = scores[i] + return mode From bf38b3b883276d4181101b2b553432d237201817 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 10 Aug 2015 05:58:43 +0200 Subject: [PATCH 055/138] * Hack on l/r reversal bug --- spacy/syntax/parser.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index ed89d4cc8..6282339bd 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -175,7 +175,8 @@ cdef class StepwiseState: moves = {'S': 0, 'D': 1, 'L': 2, 'R': 3} if action_name == '_': action_name = self.predict() - if action_name == 'L' or action_name == 'R': + action = self.parser.moves.lookup_transition(action_name) + elif action_name == 'L' or action_name == 'R': self.predict() move = moves[action_name] clas = _arg_max_clas(self.eg.c.scores, move, self.parser.moves.c, @@ -198,6 +199,6 @@ cdef int _arg_max_clas(const weight_t* scores, int move, const Transition* actio cdef int i for i in range(nr_class): if actions[i].move == move and (mode == -1 or scores[i] >= score): - mode = actions[i].clas + mode = i score = scores[i] return mode From a3290a581027f623f7b6d49d8a7f4aa50001aa1b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 00:58:22 +0200 Subject: [PATCH 056/138] * Work on landing page in jade --- docs/redesign/spacy_home.jade | 263 ++++++++++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 docs/redesign/spacy_home.jade diff --git a/docs/redesign/spacy_home.jade b/docs/redesign/spacy_home.jade new file mode 100644 index 000000000..8e548347f --- /dev/null +++ b/docs/redesign/spacy_home.jade @@ -0,0 +1,263 @@ +- var slogan = "Build Tomorrow's Language Technologies" +- var tag_line = "spaCy – #{slogan}" +- var a_minor_miracle = 'a minor miracle' + +mixin lede() + p. + spaCy is a library for industrial-strength NLP in Python and + Cython. It features state-of-the-art speed and accuracy, a concise API, and + great documentation. If you're a small company doing NLP, we want spaCy to + seem like !{a_minor_miracle}. + +mixin overview() + p. + Overview text + +mixin example() + p. + Example text + +mixin benchmarks() + p. + Benchmarks + +mixin get_started() + p. 
+ Get Started + +mixin example(name) + details + summary + span(class="example-name")= name + + block + + +mixin accuracy_head + tr + +mixin columns(...names) + tr + each name in names + th= name + + +mixin row(...cells) + tr + each cell in cells + td= cell + + +doctype html +html(lang="en") + head + meta(charset="utf-8") + title!= tag_line + meta(name="description" content="") + meta(name="author" content="Matthew Honnibal") + link(rel="stylesheet" href="css/style.css") + + + body(id="page" role="document") + header(role="banner") + h1(class="logo")!= tag_line + div(class="slogan")!= slogan + + nav(role="navigation") + ul + li: a(href="#") Home + li: a(href="#") Docs + li: a(href="#") License + li: a(href="#") Blog + + main(id="content" role="main") + section(class="intro") + +lede + + nav(role="navigation") + ul + li: a(href="#overview" class="button") Examples + li: a(href="#overview" class="button") Comparisons + li: a(href="#example-use" class="button") Demo + li: a(href="#get-started" class="button") Install + + article(class="page landing-page") + a(name="example-use"): h3 Usage by Example + + +example("Load resources and process text") + pre.language-python + code + | from __future__ import unicode_literals, print_function + | from spacy.en import English + | nlp = English() + | doc = nlp('Hello, world. Here are two sentences.') + + +example("Get tokens and sentences") + pre.language-python + code + | token = doc[0] + | sentence = doc.sents[0] + | assert token[0] is sentence[0] + + +example("Use integer IDs for any string") + pre.language-python + code + | hello_id = nlp.vocab.strings['Hello'] + | hello_str = nlp.vocab.strings[hello_id] + | + | assert token.orth == hello_id == 52 + | assert token.orth_ == hello_str == 'Hello' + + +example("Get and set string views and flags") + pre.language-python + code + | assert token.shape_ == 'Xxxx' + | for lexeme in nlp.vocab: + | if lexeme.is_alpha: + | lexeme.shape_ = 'W' + | elif lexeme.is_digit: + | lexeme.shape_ = 'D' + | elif lexeme.is_punct: + | lexeme.shape_ = 'P' + | else: + | lexeme.shape_ = 'M' + | assert token.shape_ == 'W' + + +example("Export to numpy arrays") + pre.language-python + code + | Do me + + +example("Word vectors") + pre.language-python + code + | Do me + + +example("Part-of-speech tags") + pre.language-python + code + | Do me + + +example("Syntactic dependencies") + pre.language-python + code + | Do me + + +example("Named entities") + pre.language-python + code + | Do me + + +example("Define custom NER rules") + pre.language-python + code + | Do me + + +example("Calculate inline mark-up on original string") + pre.language-python + code + | Do me + + +example("Efficient binary serialization") + pre.language-python + code + | Do me + + a(name="benchmarks"): h3 Benchmarks + + details + summary: h4 Independent Evaluation + + p + | Independent evaluation by Yahoo! Labs and Emory + | University, to appear at ACL 2015. Higher is better. + + table + thead + +columns("System", "Language", "Accuracy", "Speed") + + tbody + +row("spaCy v0.86", "Cython", "91.9", "13,963") + +row("spaCy v0.84", "Cython", "90.6", "13,963") + +row("ClearNLP", "Java", "91.7", "10,271") + +row("CoreNLP", "Java", "89.6", "8,602") + +row("MATE", "Java", "92.5", "550") + +row("Turbo", "C++", "92.4", "349") + +row("Yara", "Java", "92.3", "340") + + p + | Accuracy is % unlabelled arcs correct, speed is tokens per second. + + p + | Joel Tetreault and Amanda Stent (Yahoo! 
Labs) and Jin-ho Choi (Emory) + | performed a detailed comparison of the best parsers available. + | All numbers above are taken from the pre-print they kindly made + | available to me, except for spaCy v0.86. + + p + | I'm particularly grateful to the authors for discussion of their + | results, which led to the improvement in accuracy between v0.84 and + | v0.86. A tip from Jin-ho developer of ClearNLP) was particularly + | useful. + + details + summary: h4 Detailed Accuracy Comparison + + details + summary: h4 Detailed Speed Comparison + + table + thead + tr + th. + th(colspan=3) Absolute (ms per doc) + th(colspan=3) Relative (to spaCy) + + tbody + tr + td: strong System + td: strong Split + td: strong Tag + td: strong Parse + td: strong Split + td: strong Tag + td: strong Parse + + +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") + +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") + +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") + +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") + +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") + + p + | Set up: 100,000 plain-text documents were streamed + | from an SQLite3 database, and processed with an NLP library, to one + | of three levels of detail – tokenization, tagging, or parsing. + | The tasks are additive: to parse the text you have to tokenize and + | tag it. The pre-processing was not subtracted from the times – + | I report the time required for the pipeline to complete. I report + | mean times per document, in milliseconds. + + p + | Hardware: Intel i7-3770 (2012) + + + //+comparison("spaCy vs. NLTK") + //+comparison("spaCy vs. Pattern") + //+comparison("spaCy vs. CoreNLP") + //+comparison("spaCy vs. ClearNLP") + //+comparison("spaCy vs. OpenNLP") + //+comparison("spaCy vs. GATE") + + a(name="get-started"): h3 Get started + + +get_started + + + + + footer(role="contentinfo") + + script(src="js/prism.js") From 8a8da6118e8be711366b0a738692712ee0c28a69 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 00:59:48 +0200 Subject: [PATCH 057/138] * Work on landing page in jade --- docs/redesign/spacy_home.jade | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/redesign/spacy_home.jade b/docs/redesign/spacy_home.jade index 8e548347f..688863923 100644 --- a/docs/redesign/spacy_home.jade +++ b/docs/redesign/spacy_home.jade @@ -32,10 +32,14 @@ mixin example(name) block +mixin comparison(name) + details + summary + h4 + name -mixin accuracy_head - tr - + block + mixin columns(...names) tr each name in names @@ -167,6 +171,13 @@ html(lang="en") a(name="benchmarks"): h3 Benchmarks + +comparison("spaCy vs. NLTK") + +comparison("spaCy vs. Pattern") + +comparison("spaCy vs. CoreNLP") + +comparison("spaCy vs. ClearNLP") + +comparison("spaCy vs. OpenNLP") + +comparison("spaCy vs. GATE") + details summary: h4 Independent Evaluation @@ -243,14 +254,6 @@ html(lang="en") p | Hardware: Intel i7-3770 (2012) - - //+comparison("spaCy vs. NLTK") - //+comparison("spaCy vs. Pattern") - //+comparison("spaCy vs. CoreNLP") - //+comparison("spaCy vs. ClearNLP") - //+comparison("spaCy vs. OpenNLP") - //+comparison("spaCy vs. 
GATE") - a(name="get-started"): h3 Get started +get_started From ab39f358c17a7553f427fb69b67f93367e5fd2dd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 08:24:37 +0200 Subject: [PATCH 058/138] * Add docs.jade file --- docs/redesign/spacy_docs.jade | 563 ++++++++++++++++++++++++++++++++++ 1 file changed, 563 insertions(+) create mode 100644 docs/redesign/spacy_docs.jade diff --git a/docs/redesign/spacy_docs.jade b/docs/redesign/spacy_docs.jade new file mode 100644 index 000000000..5b64dd0da --- /dev/null +++ b/docs/redesign/spacy_docs.jade @@ -0,0 +1,563 @@ +- var unicode_type = 'unicode' +- var bool_type = 'bool' + +- var int_type = "" + +- var Token_type = "" +- var Span_type = "" +- var Vocab_type = "" +- var generator_type = "" + + + +mixin declare_class(name) + details(open="true") + summary + span.declaration + span.label class + code #{name} + block + +mixin method(name, parameters) + details(open=attributes.open) + summary + span.declaration + span.label #{name} + span.parameters + | self, #{parameters} + block + + +mixin params + ul + block + + +mixin param(name, type, value) + li + if type + #{name} (!{type}) – + else + #{name} – + block + + +mixin attribute(name, type, value) + details(open=attributes.open) + summary + span.declaration + span.label #{name} + block + + +mixin returns(name, type, value) + li + if type + #{name} (!{type}) – + else + #{name} – + block + + +mixin returns(type) + | tmp + + + + +doctype html +html(lang="en") + head + meta(charset="utf-8") + title!= tag_line + meta(name="description" content="") + meta(name="author" content="Matthew Honnibal") + link(rel="stylesheet" href="css/style.css") + + + body(id="docs") + header + h1.logo!= tag_line + div.slogan!= slogan + + + nav(role="navigation") + ul + li: a(href="#") Home + li.active: a(href="#") Docs + li: a(href="#") License + li: a(href="#") Blog + + main.docs#content + section.intro + | Tmp + + article + h3: a(href="#") Header + + +declare_class("spacy.en.English") + +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True") + + +params + +param("data_dir") + | The data directory. May be #{None}, to disable any data loading + | (including the vocabulary). + + +param("Tokenizer") + | A class/function that creates the tokenizer. + + +param("Tagger") + | A class/function that creates the part-of-speech tagger. + + +param("Parser") + | A class/function that creates the dependency parser. + + +param("Entity") + | A class/function that creates the named entity recogniser. + + +param("load_vectors") + | A boolean value to control whether the word vectors are loaded. + + + +method("__call__", "text, tag=True, parse=True, entity=True")(open) + + +params + +param("text", unicode_type) + | The text to be processed. No pre-processing needs to be applied, + | and any length of text can be submitted. Usually you will submit + | a whole document. Text may be zero-length. An exception is raised + | if byte strings are supplied. + + +param("tag", bool_type) + | Whether to apply the part-of-speech tagger. Required for parsing + | and entity recognition. + + +param("parse", bool_type) + | Whether to apply the syntactic dependency parser. + + +param("entity", bool_type) + | Whether to apply the named entity recognizer. + + pre.language-python + code + | from spacy.en import English + | nlp = English() + | doc = nlp(u'Some text.) 
# Applies tagger, parser, entity + | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser + | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity + | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser + | doc = nlp(u'') # Zero-length tokens, not an error + | # doc = nlp(b'Some text') <-- Error: need unicode + | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. + + +declare_class("spacy.tokens.doc.Doc") + +method("__init__", "vocab") + +params + +param("vocab", vocab_type) + | A vocabulary object + + +method("__getitem__", "i", int_type) + +returns(Token_type) + + +method("__getitem__", "start_end", slice_type) + +returns(Span_type) + + +method("__iter__") + | Iterate over tokens + + +method("__len__") + | Number of tokens in the document. + + +attribute("sents", generator_type) + | Iterate over sentences in the document. + + +attribute("ents", generator_type) + | Iterate over named entities in the document. + + +attribute("noun_chunks", generator_type) + + +method("to_array", "attr_ids") + + | Given a list of M attribute IDs, export the tokens to a numpy ndarray + | of shape N*M, where N is the length of the sentence. + + +params + + +param("attr_ids", "list[int]") + | A list of attribute ID ints. + + +returns("feat_array") + | A feature matrix, with one row per word, and one column per attribute + | indicated in the input attr_ids. + + +method("count_by", "attr_id") + + | Produce a dict of {attribute (int): count (ints)} frequencies, keyed + | by the values of the given attribute ID. + + pre.language-python + code + | >>> from spacy.en import English, attrs + | >>> nlp = English() + | >>> tokens = nlp(u'apple apple orange banana') + | >>> tokens.count_by(attrs.ORTH) + | {12800L: 1, 11880L: 2, 7561L: 1} + | >>> tokens.to_array([attrs.ORTH]) + | array([[11880], + | [11880], + | [7561], + | [12800]]) + + +method("from_array", "attrs, array") + | Load from array + + +method("to_bytes") + | Serialize + + +method("from_bytes") + | Deserialize, loading from bytes + + +method("read_bytes") + | classmethod + + +method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") + + | Merge a multi-word expression into a single token. Currently + | experimental; API is likely to change. + + + +declare_class("spacy.tokens.Token") + +method("__init__", "vocab, doc, offset") + +params + +param("vocab", Vocab_type) + p A Vocab object + + +param("doc", Doc_type) + p The parent sequence + + +param("offset", Int_type) + p The index of the token within the document + + details + summary: h4 String Views + + +attribute("orth / orth_") + | The form of the word with no string normalization or processing, as + | it appears in the string, without trailing whitespace. + + +attribute("lemma / lemma_") + | The "base" of the word, with no inflectional suffixes, e.g. the lemma of + | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that + | derivational suffixes are not stripped, e.g. the lemma of + | "instutitions" is "institution", not "institute". Lemmatization is + | performed using the WordNet data, but extended to also cover closed-class + | words such as pronouns. By default, the WN lemmatizer returns "hi" + | as the lemma of "his". We assign pronouns the lemma -PRON-. + + +attribute("lower / lower_") + | The form of the word, but forced to lower-case, i.e. 
+ pre.language-python: code lower = word.orth\_.lower() + + //+attribute("norm / norm_") + // | The form of the word, after language-specific normalizations has been + // | applied. + + +attribute("shape / shape_") + | A transform of the word's string, to show orthographic features. + | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped + | to d. After these mappings, sequences of 4 or more of the same character + | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, + | :) --> :) + + +attribute("prefix / prefix_") + | A length-N substring from the start of the word. Length may vary by + | language; currently for English n=1, i.e. + pre.language-python: code prefix = word.orth\_[:1] + + +attribute("suffix / suffix_") + | A length-N substring from the end of the word. Length may vary by + | language; currently for English n=3, i.e. + pre.language-python: code suffix = word.orth\_[-3:] + + //+attribute("lex_id") + // | lex_id + + details + summary: h4 Alignment and Output + + +attribute("idx") + p Start index of the token in the string + + +method("__len__", "") + p Length of the token's orth string, in unicode code-points. + + +method("__unicode__", "") + p Same as token.orth_ + + +method("__str__", "") + p Varies between Python 2 and Python 3 + + +attribute("string") + p + | The form of the word as it appears in the string, including + | trailing whitespace. This is useful when you need to use + | linguistic features to add inline mark-up to the string. + + +method("nbor, i=1") + +params + +param("i") + p Offset relative to token + + details + summary: h4 Distributional Features + + +attribute("repvec") + p + | A "word embedding" representation: a dense real-valued vector that supports + | similarity queries between words. By default, spaCy currently loads + | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec + | model. + + +attribute("cluster") + p + | The Brown cluster ID of the word. These are often useful features for + | linear models. If you're using a non-linear model, particularly a + | neural net or random forest, consider using the real-valued word + | representation vector, in Token.repvec, instead. + + +attribute("prob") + p + | The unigram log-probability of the word, estimated from counts from a + | large corpus, smoothed using Simple Good Turing estimation. + + details + summary: h4 Syntactic Tags + + +attribute("pos / pos_") + | A part-of-speech tag, from the Google Universal Tag Set, e.g. + | code>NOUN, VERB, ADV. Constants for + | the 17 tag values are provided in spacy.parts_of_speech. + + +attribute("tag / tag_") + | A morphosyntactic tag, e.g. NN, VBZ, + | DT, etc. These tags are language/corpus specific, and + | typically describe part-of-speech and some amount of morphological + | information. For instance, in the Penn Treebank tag set, VBZ + | is assigned to a present-tense singular verb. + + +attribute("dep / dep_") + | The type of syntactic dependency relation between the word and its + | syntactic head. + + details + summary: h4 Navigating the Parse Tree + + +attribute("head") + p + | The Token that is the immediate syntactic head of the word. If the + | word is the root of the dependency tree, the same word is returned. + + +attribute("lefts") + p + | An iterator for the immediate leftward syntactic children of the + | word. + + +attribute("rights") + p + | An iterator for the immediate rightward syntactic children of the + | word. 
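A rough sketch of how the lefts and rights iterators described above (and the children iterator documented just below) fit together. The token index and the counts in the comments assume the parse the bundled English model assigns to this sentence, so treat them as illustrative:

    from spacy.en import English

    nlp = English()
    sent = nlp(u'The four wheels on the bus turned quickly.')
    wheels = sent[2]                        # 'wheels'
    assert len(list(wheels.lefts)) == 2     # 'The', 'four'
    assert len(list(wheels.rights)) == 1    # 'on the bus' attaches via 'on'
    assert len(list(wheels.children)) == 3  # yields from lefts, then from rights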
+ + +attribute("n_lefts") + p + | The number of immediate syntactic children preceding the word in + | the string. + + +attribute("n_rights") + p + | The number of immediate syntactic children following the word in + | the string. + + +attribute("children") + p + | An iterator that yields from lefts, and then yields from rights. + + +attribute("subtree") + p + | An iterator for the part of the sentence syntactically governed by + | the word, including the word itself. + + +attribute("left_edge") + p The leftmost edge of the token's subtree + + +attribute("right_edge") + p The rightmost edge of the token's subtree + + details + summary: h4 Named Entities + + +attribute("ent_type") + p If the token is part of an entity, its entity type. + + +attribute("ent_iob") + p The IOB (inside, outside, begin) entity recognition tag for the token. + + details + summary: h4 Lexeme Flags + + +method("check_flag", "flag_id") + +params + +param("flag_id") + | flag ID + + +attribute("is_oov") + +attribute("is_alpha") + +attribute("is_ascii") + +attribute("is_digit") + +attribute("is_lower") + +attribute("is_title") + +attribute("is_punct") + +attribute("is_space") + +attribute("like_url") + +attribute("like_num") + +attribute("like_email") + + //+attribute("conjuncts") + // | Conjuncts + + +declare_class("spacy.tokens.span.Span") + +params + +method("__getitem__") + p Get item + + +method("__iter__") + p Iter + + +method("__len__") + p Len + + +attribute("root") + p Syntactic head + + +attribute("lefts") + p Tokens that are: + ol + li To the left of the span; + li Syntactic children of words within the span + + p i.e. + + pre.language-python + code + | lefts = [span.doc[i] for i in range(0, span.start) + | if span.doc[i].head in span] + + +attribute("rights") + p Tokens that are: + ol + li To the right of the span; + li Syntactic children of words within the span + p i.e. + pre.language-python + code + | rights = [span.doc[i] for i in range(span.end, len(span.doc)) + | if span.doc[i].head in span] + + +attribute("string") + p String + + +attribute("lemma / lemma_") + p String + + +attribute("label / label_") + p String + + +attribute("subtree") + p String + + +declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None") + +method("__len__") + +returns + p Number of words in the vocabulary. + + +method("__getitem__", "key_int") + +params + +param("key") + p Integer ID + + +returns: p A Lexeme object + + +method("__getitem__", "key_str") + +params + +param("key_str", unicode_type) + p A string in the vocabulary + + +returns("Lexeme") + + +method("__setitem__", "orth_str", "props") + +params + +param("orth_str", unicode_type) + p The orth key + + +param("props", dict_type) + p A props dictionary + + +returns("None") + + +method("dump", "loc") + +params + +param("loc", unicode_type) + p Path where the vocabulary should be saved + + +method("load_lexemes", "loc") + +params + +param("loc", unicode_type) + p Path to load the lexemes.bin file from + + +method("load_vectors", "loc") + +params + +param("loc", unicode_type) + p Path to load the vectors.bin from + + + +declare_class("spacy.strings.StringStore") + +method("__len__") + +returns("int") + p Number of strings in the string-store + + +method("__getitem__", "key_int") + +params + +param("key_int") + p An integer key + + +returns(unicode_type) + p The string that the integer key maps to + + +method("__getitem__", "key_unicode") + +params + +param("key_unicode") + p A key, as a unicode string + + +returns(int_type) + p The integer ID of the string. 
+ + +method("__getitem__", "key_utf8_bytes") + +params + +param("key_utf8_bytes", bytes_type) + p p A key, as a UTF-8 encoded byte-string + + +returns(int_type) + p The integer ID of the string. + + +method("dump", "loc") + +params + +param("loc") + p File path to save the strings.txt to. + + +method("load") + +params + +param("loc") + p File path to load the strings.txt from. + + script(src="js/prism.js") From c767ab9fdfdeaa513040fbf8c16d454ee2dacf54 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 20:21:26 +0200 Subject: [PATCH 059/138] * Work on documentation. Have overall structure now --- docs/redesign/spacy_docs.jade | 976 +++++++++++++++++++--------------- 1 file changed, 559 insertions(+), 417 deletions(-) diff --git a/docs/redesign/spacy_docs.jade b/docs/redesign/spacy_docs.jade index 5b64dd0da..29f0512e7 100644 --- a/docs/redesign/spacy_docs.jade +++ b/docs/redesign/spacy_docs.jade @@ -1,17 +1,19 @@ -- var unicode_type = 'unicode' -- var bool_type = 'bool' - -- var int_type = "" - -- var Token_type = "" -- var Span_type = "" -- var Vocab_type = "" -- var generator_type = "" +- var py_docs = 'unicode', + 'bool': py_docs + 'functions.html#bool">bool', + 'int': py_docs + 'functions.html#int">int', + 'generator': "", + 'Vocab': "", + 'Span': "", + 'Doc': "" + } mixin declare_class(name) - details(open="true") + details summary span.declaration span.label class @@ -62,14 +64,54 @@ mixin returns(name, type, value) mixin returns(type) | tmp +mixin init + details + summary: h4 Init + block + + +mixin callable + details + summary: h4 Callable + + block + + +mixin sequence + details + summary: h4 Sequence + + block + + +mixin maptype + details + summary: h4 Map + + block + + +mixin summary + block + +mixin en_example + pre.language-python + code + | from spacy.en import English + | from spacy._doc_examples import download_war_and_peace + | + | unprocessed_unicode = download_war_and_peace() + | + | nlp = English() + | doc = nlp(unprocessed_unicode) doctype html html(lang="en") head meta(charset="utf-8") - title!= tag_line + title spaCy – Industrial-strength NLP meta(name="description" content="") meta(name="author" content="Matthew Honnibal") link(rel="stylesheet" href="css/style.css") @@ -78,9 +120,9 @@ html(lang="en") body(id="docs") - header - h1.logo!= tag_line - div.slogan!= slogan + header(role="banner") + h1.logo spaCy – Industrial-strength NLP + div.slogan API nav(role="navigation") @@ -91,473 +133,573 @@ html(lang="en") li: a(href="#") Blog main.docs#content - section.intro - | Tmp article - h3: a(href="#") Header + +declare_class("English") + p Load models into a callable object to process English text. - +declare_class("spacy.en.English") - +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True") + +summary + +en_example - +params - +param("data_dir") - | The data directory. May be #{None}, to disable any data loading - | (including the vocabulary). + +init + p + | Load the resources. Loading takes 20 seconds, and the instance + | consumes 2 to 3 gigabytes of memory. + + p + | Intended use is for one instance to be created per process. + | You can create more if you're doing something unusual. + p + | You may wish to make the instance a global variable or "singleton". + | We usually instantiate the object in the main() + | function and pass it around as an explicit argument. 
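A minimal sketch of that pattern, assuming one English instance per process; the helper names (main, count_nouns) and the example sentence are illustrative only:

    from __future__ import print_function
    from spacy.en import English

    def count_nouns(nlp, text):
        doc = nlp(text)
        return sum(1 for token in doc if token.pos_ == 'NOUN')

    def main():
        nlp = English()    # expensive to load; do this once and pass it around
        print(count_nouns(nlp, u'The cat sat on the mat.'))

    if __name__ == '__main__':
        main()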
+ +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") - +param("Tokenizer") - | A class/function that creates the tokenizer. + +params + +param("data_dir") + | The data directory. May be #{None}, to disable any data loading + | (including the vocabulary). - +param("Tagger") - | A class/function that creates the part-of-speech tagger. + +param("Tokenizer") + | A class/function that creates the tokenizer. - +param("Parser") - | A class/function that creates the dependency parser. + +param("Tagger") + | A class/function that creates the part-of-speech tagger. - +param("Entity") - | A class/function that creates the named entity recogniser. + +param("Parser") + | A class/function that creates the dependency parser. - +param("load_vectors") - | A boolean value to control whether the word vectors are loaded. + +param("Entity") + | A class/function that creates the named entity recogniser. + +param("load_vectors") + | A boolean value to control whether the word vectors are loaded. + + +callable + +method("__call__", "text, tag=True, parse=True, entity=True") - +method("__call__", "text, tag=True, parse=True, entity=True")(open) + +params + +param("text", types.unicode) + | The text to be processed. No pre-processing needs to be applied, + | and any length of text can be submitted. Usually you will submit + | a whole document. Text may be zero-length. An exception is raised + | if byte strings are supplied. - +params - +param("text", unicode_type) - | The text to be processed. No pre-processing needs to be applied, - | and any length of text can be submitted. Usually you will submit - | a whole document. Text may be zero-length. An exception is raised - | if byte strings are supplied. + +param("tag", bool_type) + | Whether to apply the part-of-speech tagger. Required for parsing + | and entity recognition. - +param("tag", bool_type) - | Whether to apply the part-of-speech tagger. Required for parsing - | and entity recognition. + +param("parse", bool_type) + | Whether to apply the syntactic dependency parser. - +param("parse", bool_type) - | Whether to apply the syntactic dependency parser. + +param("entity", bool_type) + | Whether to apply the named entity recognizer. - +param("entity", bool_type) - | Whether to apply the named entity recognizer. + pre.language-python + code + | from spacy.en import English + | nlp = English() + | doc = nlp(u'Some text.) # Applies tagger, parser, entity + | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser + | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity + | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser + | doc = nlp(u'') # Zero-length tokens, not an error + | # doc = nlp(b'Some text') <-- Error: need unicode + | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. + + +declare_class("Doc") + p I'm a doc + + +init + +method("__init__", "vocab") + +params + +param("vocab", vocab_type) + | A vocabulary object + + +sequence + +method("__getitem__", "i", types.int) + +returns(types.Token) + + +method("__getitem__", "start_end", types.slice) + +returns(types.Span) + + +method("__iter__") + | Iterate over tokens + + +method("__len__") + | Number of tokens in the document. + + details + summary: h4 Spans + + +attribute("sents", types.generator) + | Iterate over sentences in the document. + + +attribute("ents", types.generator) + | Iterate over named entities in the document. 
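A short example of iterating the sentence and entity spans just described; which entities are recognised depends on the statistical models, so the comment is illustrative:

    from __future__ import print_function
    from spacy.en import English

    nlp = English()
    doc = nlp(u'London is a big city in the United Kingdom. I like it.')
    for sent in doc.sents:
        print(sent.string)
    for ent in doc.ents:
        print(ent.label_, ent.string)   # e.g. GPE London, GPE the United Kingdom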
+ + +attribute("noun_chunks", types.generator) + + details + summary: h4 Export/Import + + +method("to_array", "attr_ids") + + | Given a list of M attribute IDs, export the tokens to a numpy ndarray + | of shape N*M, where N is the length of the sentence. + + +params + +param("attr_ids", "list[int]") + | A list of attribute ID ints. + + +returns("feat_array") + | A feature matrix, with one row per word, and one column per attribute + | indicated in the input attr_ids. + + +method("count_by", "attr_id") + | Produce a dict of {attribute (int): count (ints)} frequencies, keyed + | by the values of the given attribute ID. + pre.language-python code - | from spacy.en import English - | nlp = English() - | doc = nlp(u'Some text.) # Applies tagger, parser, entity - | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser - | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity - | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser - | doc = nlp(u'') # Zero-length tokens, not an error - | # doc = nlp(b'Some text') <-- Error: need unicode - | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. - - +declare_class("spacy.tokens.doc.Doc") - +method("__init__", "vocab") - +params - +param("vocab", vocab_type) - | A vocabulary object - - +method("__getitem__", "i", int_type) - +returns(Token_type) - - +method("__getitem__", "start_end", slice_type) - +returns(Span_type) - - +method("__iter__") - | Iterate over tokens - - +method("__len__") - | Number of tokens in the document. - - +attribute("sents", generator_type) - | Iterate over sentences in the document. - - +attribute("ents", generator_type) - | Iterate over named entities in the document. + | >>> from spacy.en import English, attrs + | >>> nlp = English() + | >>> tokens = nlp(u'apple apple orange banana') + | >>> tokens.count_by(attrs.ORTH) + | {12800L: 1, 11880L: 2, 7561L: 1} + | >>> tokens.to_array([attrs.ORTH]) + | array([[11880], + | [11880], + | [7561], + | [12800]]) - +attribute("noun_chunks", generator_type) - - +method("to_array", "attr_ids") - - | Given a list of M attribute IDs, export the tokens to a numpy ndarray - | of shape N*M, where N is the length of the sentence. - - +params - - +param("attr_ids", "list[int]") - | A list of attribute ID ints. - - +returns("feat_array") - | A feature matrix, with one row per word, and one column per attribute - | indicated in the input attr_ids. - - +method("count_by", "attr_id") - - | Produce a dict of {attribute (int): count (ints)} frequencies, keyed - | by the values of the given attribute ID. + +method("from_array", "attrs, array") + | Load from array - pre.language-python - code - | >>> from spacy.en import English, attrs - | >>> nlp = English() - | >>> tokens = nlp(u'apple apple orange banana') - | >>> tokens.count_by(attrs.ORTH) - | {12800L: 1, 11880L: 2, 7561L: 1} - | >>> tokens.to_array([attrs.ORTH]) - | array([[11880], - | [11880], - | [7561], - | [12800]]) - - +method("from_array", "attrs, array") - | Load from array - - +method("to_bytes") - | Serialize - - +method("from_bytes") - | Deserialize, loading from bytes - - +method("read_bytes") - | classmethod - - +method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") - - | Merge a multi-word expression into a single token. Currently - | experimental; API is likely to change. 
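A brief sketch of the noun_chunks accessor documented above; chunk boundaries come from the dependency parse, so the exact spans printed are illustrative:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'The quick brown fox jumped over the lazy dog.')
    for np in doc.noun_chunks:
        print(np.string)    # e.g. 'The quick brown fox ', then 'the lazy dog'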
- - - +declare_class("spacy.tokens.Token") - +method("__init__", "vocab, doc, offset") - +params - +param("vocab", Vocab_type) - p A Vocab object - - +param("doc", Doc_type) - p The parent sequence - - +param("offset", Int_type) - p The index of the token within the document - - details - summary: h4 String Views - - +attribute("orth / orth_") - | The form of the word with no string normalization or processing, as - | it appears in the string, without trailing whitespace. - - +attribute("lemma / lemma_") - | The "base" of the word, with no inflectional suffixes, e.g. the lemma of - | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that - | derivational suffixes are not stripped, e.g. the lemma of - | "instutitions" is "institution", not "institute". Lemmatization is - | performed using the WordNet data, but extended to also cover closed-class - | words such as pronouns. By default, the WN lemmatizer returns "hi" - | as the lemma of "his". We assign pronouns the lemma -PRON-. - - +attribute("lower / lower_") - | The form of the word, but forced to lower-case, i.e. - pre.language-python: code lower = word.orth\_.lower() - - //+attribute("norm / norm_") - // | The form of the word, after language-specific normalizations has been - // | applied. - - +attribute("shape / shape_") - | A transform of the word's string, to show orthographic features. - | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped - | to d. After these mappings, sequences of 4 or more of the same character - | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, - | :) --> :) - - +attribute("prefix / prefix_") - | A length-N substring from the start of the word. Length may vary by - | language; currently for English n=1, i.e. - pre.language-python: code prefix = word.orth\_[:1] - - +attribute("suffix / suffix_") - | A length-N substring from the end of the word. Length may vary by - | language; currently for English n=3, i.e. - pre.language-python: code suffix = word.orth\_[-3:] - - //+attribute("lex_id") - // | lex_id - - details - summary: h4 Alignment and Output - - +attribute("idx") - p Start index of the token in the string - - +method("__len__", "") - p Length of the token's orth string, in unicode code-points. - - +method("__unicode__", "") - p Same as token.orth_ - - +method("__str__", "") - p Varies between Python 2 and Python 3 - - +attribute("string") - p - | The form of the word as it appears in the string, including - | trailing whitespace. This is useful when you need to use - | linguistic features to add inline mark-up to the string. - - +method("nbor, i=1") - +params - +param("i") - p Offset relative to token + +method("from_bytes") + | Deserialize, loading from bytes - details - summary: h4 Distributional Features + +method("read_bytes") + | classmethod - +attribute("repvec") - p - | A "word embedding" representation: a dense real-valued vector that supports - | similarity queries between words. By default, spaCy currently loads - | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec - | model. + //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") - +attribute("cluster") - p - | The Brown cluster ID of the word. These are often useful features for - | linear models. If you're using a non-linear model, particularly a - | neural net or random forest, consider using the real-valued word - | representation vector, in Token.repvec, instead. 
+ // | Merge a multi-word expression into a single token. Currently + // | experimental; API is likely to change. - +attribute("prob") - p - | The unigram log-probability of the word, estimated from counts from a - | large corpus, smoothed using Simple Good Turing estimation. - details - summary: h4 Syntactic Tags + +declare_class("Token") + +init + +method("__init__", "vocab, doc, offset") + +params + +param("vocab", types.Vocab) + p A Vocab object - +attribute("pos / pos_") - | A part-of-speech tag, from the Google Universal Tag Set, e.g. - | code>NOUN, VERB, ADV. Constants for - | the 17 tag values are provided in spacy.parts_of_speech. + +param("doc", types.Doc) + p The parent sequence - +attribute("tag / tag_") - | A morphosyntactic tag, e.g. NN, VBZ, - | DT, etc. These tags are language/corpus specific, and - | typically describe part-of-speech and some amount of morphological - | information. For instance, in the Penn Treebank tag set, VBZ - | is assigned to a present-tense singular verb. + +param("offset", types.int) + p The index of the token within the document - +attribute("dep / dep_") - | The type of syntactic dependency relation between the word and its - | syntactic head. + details + summary: h4 String Views - details - summary: h4 Navigating the Parse Tree - - +attribute("head") - p - | The Token that is the immediate syntactic head of the word. If the - | word is the root of the dependency tree, the same word is returned. + +attribute("orth / orth_") + | The form of the word with no string normalization or processing, as + | it appears in the string, without trailing whitespace. - +attribute("lefts") - p - | An iterator for the immediate leftward syntactic children of the - | word. + +attribute("lemma / lemma_") + | The "base" of the word, with no inflectional suffixes, e.g. the lemma of + | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that + | derivational suffixes are not stripped, e.g. the lemma of + | "instutitions" is "institution", not "institute". Lemmatization is + | performed using the WordNet data, but extended to also cover closed-class + | words such as pronouns. By default, the WN lemmatizer returns "hi" + | as the lemma of "his". We assign pronouns the lemma -PRON-. - +attribute("rights") - p - | An iterator for the immediate rightward syntactic children of the - | word. + +attribute("lower / lower_") + | The form of the word, but forced to lower-case, i.e. + pre.language-python: code lower = word.orth\_.lower() - +attribute("n_lefts") - p - | The number of immediate syntactic children preceding the word in - | the string. + //+attribute("norm / norm_") + // | The form of the word, after language-specific normalizations has been + // | applied. - +attribute("n_rights") - p - | The number of immediate syntactic children following the word in - | the string. + +attribute("shape / shape_") + | A transform of the word's string, to show orthographic features. + | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped + | to d. After these mappings, sequences of 4 or more of the same character + | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, + | :) --> :) - +attribute("children") - p - | An iterator that yields from lefts, and then yields from rights. + +attribute("prefix / prefix_") + | A length-N substring from the start of the word. Length may vary by + | language; currently for English n=1, i.e. 
+ pre.language-python: code prefix = word.orth\_[:1] - +attribute("subtree") - p - | An iterator for the part of the sentence syntactically governed by - | the word, including the word itself. + +attribute("suffix / suffix_") + | A length-N substring from the end of the word. Length may vary by + | language; currently for English n=3, i.e. + pre.language-python: code suffix = word.orth\_[-3:] - +attribute("left_edge") - p The leftmost edge of the token's subtree + //+attribute("lex_id") + // | lex_id - +attribute("right_edge") - p The rightmost edge of the token's subtree + details + summary: h4 Alignment and Output - details - summary: h4 Named Entities + +attribute("idx") + p Start index of the token in the string - +attribute("ent_type") - p If the token is part of an entity, its entity type. + +method("__len__", "") + p Length of the token's orth string, in unicode code-points. - +attribute("ent_iob") - p The IOB (inside, outside, begin) entity recognition tag for the token. + +method("__unicode__", "") + p Same as token.orth_ - details - summary: h4 Lexeme Flags + +method("__str__", "") + p Varies between Python 2 and Python 3 - +method("check_flag", "flag_id") - +params - +param("flag_id") - | flag ID + +attribute("string") + p + | The form of the word as it appears in the string, including + | trailing whitespace. This is useful when you need to use + | linguistic features to add inline mark-up to the string. - +attribute("is_oov") - +attribute("is_alpha") - +attribute("is_ascii") - +attribute("is_digit") - +attribute("is_lower") - +attribute("is_title") - +attribute("is_punct") - +attribute("is_space") - +attribute("like_url") - +attribute("like_num") - +attribute("like_email") + +method("nbor, i=1") + +params + +param("i") + p Offset relative to token + + details + summary: h4 Distributional Features + + +attribute("repvec") + p + | A "word embedding" representation: a dense real-valued vector that supports + | similarity queries between words. By default, spaCy currently loads + | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec + | model. + + +attribute("cluster") + p + | The Brown cluster ID of the word. These are often useful features for + | linear models. If you're using a non-linear model, particularly a + | neural net or random forest, consider using the real-valued word + | representation vector, in Token.repvec, instead. + + +attribute("prob") + p + | The unigram log-probability of the word, estimated from counts from a + | large corpus, smoothed using Simple Good Turing estimation. + + details + summary: h4 Syntactic Tags + + +attribute("pos / pos_") + p + | A part-of-speech tag, from the Google Universal Tag Set, e.g. + | code>NOUN, VERB, ADV. Constants for + | the 17 tag values are provided in spacy.parts_of_speech. + + +attribute("tag / tag_") + p + | A morphosyntactic tag, e.g. NN, VBZ, + | DT, etc. These tags are language/corpus specific, and + | typically describe part-of-speech and some amount of morphological + | information. For instance, in the Penn Treebank tag set, VBZ + | is assigned to a present-tense singular verb. + + +attribute("dep / dep_") + p + | The type of syntactic dependency relation between the word and its + | syntactic head. + + details + summary: h4 Navigating the Parse Tree + + +attribute("head") + p + | The Token that is the immediate syntactic head of the word. If the + | word is the root of the dependency tree, the same word is returned. 
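A small sketch of climbing to the sentence root via head, relying on the convention noted above that the root is its own head; the sentence and the printed root are illustrative:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'I like green eggs and ham.')
    word = doc[3]                   # 'eggs'
    while word.head is not word:    # the root is its own head
        word = word.head
    print(word.orth_)               # the sentence root, e.g. 'like'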
+ + +attribute("lefts") + p + | An iterator for the immediate leftward syntactic children of the + | word. + + +attribute("rights") + p + | An iterator for the immediate rightward syntactic children of the + | word. + + +attribute("n_lefts") + p + | The number of immediate syntactic children preceding the word in + | the string. + + +attribute("n_rights") + p + | The number of immediate syntactic children following the word in + | the string. + + +attribute("children") + p + | An iterator that yields from lefts, and then yields from rights. + + +attribute("subtree") + p + | An iterator for the part of the sentence syntactically governed by + | the word, including the word itself. + + +attribute("left_edge") + p The leftmost edge of the token's subtree + + +attribute("right_edge") + p The rightmost edge of the token's subtree + + details + summary: h4 Named Entities + + +attribute("ent_type") + p If the token is part of an entity, its entity type. + + +attribute("ent_iob") + p The IOB (inside, outside, begin) entity recognition tag for the token. + + details + summary: h4 Lexeme Flags + + +method("check_flag", "flag_id") + +params + +param("flag_id") + | flag ID + + +attribute("is_oov") + +attribute("is_alpha") + +attribute("is_ascii") + +attribute("is_digit") + +attribute("is_lower") + +attribute("is_title") + +attribute("is_punct") + +attribute("is_space") + +attribute("like_url") + +attribute("like_num") + +attribute("like_email") + + //+attribute("conjuncts") + // | Conjuncts + + +declare_class("Span") + +init + +method("__init__") + Temp - //+attribute("conjuncts") - // | Conjuncts + span = doc[0:4] - +declare_class("spacy.tokens.span.Span") - +params - +method("__getitem__") - p Get item - - +method("__iter__") - p Iter - - +method("__len__") - p Len - - +attribute("root") - p Syntactic head - - +attribute("lefts") - p Tokens that are: - ol - li To the left of the span; - li Syntactic children of words within the span - - p i.e. - - pre.language-python - code - | lefts = [span.doc[i] for i in range(0, span.start) - | if span.doc[i].head in span] - - +attribute("rights") - p Tokens that are: - ol - li To the right of the span; + +sequence + +method("__getitem__") + p Get item + + +method("__iter__") + p Iter + + +method("__len__") + p Len + + details + summary: h4 Parse + + +attribute("root") + p Syntactic head + + +attribute("lefts") + p Tokens that are: + ol + li To the left of the span; li Syntactic children of words within the span - p i.e. - pre.language-python - code - | rights = [span.doc[i] for i in range(span.end, len(span.doc)) - | if span.doc[i].head in span] - - +attribute("string") - p String - +attribute("lemma / lemma_") - p String - - +attribute("label / label_") - p String - - +attribute("subtree") - p String - - +declare_class("spacy.vocab.Vocab", "data_dir=None, lex_props_getter=None") - +method("__len__") - +returns - p Number of words in the vocabulary. + p i.e. - +method("__getitem__", "key_int") - +params - +param("key") - p Integer ID + pre.language-python + code + | lefts = [span.doc[i] for i in range(0, span.start) + | if span.doc[i].head in span] - +returns: p A Lexeme object + +attribute("rights") + p Tokens that are: + ol + li To the right of the span; + li Syntactic children of words within the span + p i.e. 
+ pre.language-python + code + | rights = [span.doc[i] for i in range(span.end, len(span.doc)) + | if span.doc[i].head in span] - +method("__getitem__", "key_str") - +params - +param("key_str", unicode_type) - p A string in the vocabulary - +returns("Lexeme") + +attribute("subtree") + p String - +method("__setitem__", "orth_str", "props") - +params - +param("orth_str", unicode_type) - p The orth key + details + summary: h4 String Views - +param("props", dict_type) - p A props dictionary + +attribute("string") + p String + + +attribute("lemma / lemma_") + p String - +returns("None") + +attribute("label / label_") + p String - +method("dump", "loc") - +params - +param("loc", unicode_type) - p Path where the vocabulary should be saved + +declare_class("Lexeme") + p + | The Lexeme object represents a lexical type, stored in the vocabulary + | – as opposed to a token, occurring in a document. + p + | Lexemes store various features, so that these features can be computed + | once per type, rather than once per token. As job sizes grow, this + | can amount to a substantial efficiency improvement. - +method("load_lexemes", "loc") - +params - +param("loc", unicode_type) - p Path to load the lexemes.bin file from + p + | All Lexeme attributes are therefore context independent, as a single + | lexeme is reused for all usages of that word. Lexemes are keyed by + | the “orth” attribute. - +method("load_vectors", "loc") - +params - +param("loc", unicode_type) - p Path to load the vectors.bin from + p + All Lexeme attributes are accessible directly on the Token object. - - +declare_class("spacy.strings.StringStore") - +method("__len__") - +returns("int") - p Number of strings in the string-store + +init + +method("__init__") + p Init - +method("__getitem__", "key_int") - +params - +param("key_int") - p An integer key + details + summary: h4 String Features - +returns(unicode_type) - p The string that the integer key maps to + +attribute("orth / orth_") + p + | The form of the word with no string normalization or processing, + | as it appears in the string, without trailing whitespace. + + +attribute("lower / lower_") + p Tmp + + +attribute("norm / norm_") + p Tmp + + +attribute("shape / shape_") + p Tmp + + +attribute("prefix / prefix_") + p Tmp + + +attribute("suffix / suffix_") + p TMP - +method("__getitem__", "key_unicode") - +params - +param("key_unicode") - p A key, as a unicode string + +declare_class("Vocab", "data_dir=None, lex_props_getter=None") + +sequence + +method("__len__") + +returns + p Number of words in the vocabulary. - +returns(int_type) - p The integer ID of the string. 
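A short round-trip example of the mapping just described, via the string store attached to the vocabulary; the particular integer ID assigned is model-dependent:

    from spacy.en import English

    nlp = English()
    the_id = nlp.vocab.strings[u'the']     # unicode key in, integer ID out
    the_str = nlp.vocab.strings[the_id]    # integer key in, unicode string out
    assert the_str == u'the'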
+ +method("__iter__") + +returns + p Lexeme + + +maptype + +method("__getitem__", "key_int") + +params + +param("key") + p Integer ID + + +returns: p A Lexeme object + + +method("__getitem__", "key_str") + +params + +param("key_str", types.unicode) + p A string in the vocabulary + + +returns("Lexeme") + + +method("__setitem__", "orth_str", "props") + +params + +param("orth_str", types.unicode) + p The orth key + + +param("props", types.dict) + p A props dictionary + + +returns("None") - +method("__getitem__", "key_utf8_bytes") - +params - +param("key_utf8_bytes", bytes_type) - p p A key, as a UTF-8 encoded byte-string + details + summary: h4 Import/Export + + +method("dump", "loc") + +params + +param("loc", types.unicode) + p Path where the vocabulary should be saved + + +method("load_lexemes", "loc") + +params + +param("loc", types.unicode) + p Path to load the lexemes.bin file from + + +method("load_vectors", "loc") + +params + +param("loc", types.unicode) + p Path to load the vectors.bin from - +returns(int_type) - p The integer ID of the string. + +declare_class("StringStore") + +init + Tmp - +method("dump", "loc") - +params - +param("loc") - p File path to save the strings.txt to. + +sequence + +method("__len__") + +returns("int") + p Number of strings in the string-store + + +method("__iter__") + +returns + p Lexeme + + +maptype + +method("__getitem__", "key_int") + +params + +param("key_int") + p An integer key + + +returns(types.unicode) + p The string that the integer key maps to + + +method("__getitem__", "key_unicode") + +params + +param("key_unicode") + p A key, as a unicode string + + +returns(types.int) + p The integer ID of the string. + + +method("__getitem__", "key_utf8_bytes") + +params + +param("key_utf8_bytes", types.bytes) + p p A key, as a UTF-8 encoded byte-string + + +returns(types.int) + p The integer ID of the string. + + details + summary: h4 Import/Export + + +method("dump", "loc") + +params + +param("loc") + p File path to save the strings.txt to. + + +method("load") + +params + +param("loc") + p File path to load the strings.txt from. - +method("load") - +params - +param("loc") - p File path to load the strings.txt from. - script(src="js/prism.js") From 1db080047bc61186e668ea9d87a51db86e4650e9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 22:39:48 +0200 Subject: [PATCH 060/138] * More work on docs --- docs/redesign/spacy_home.jade | 286 ++++++++-------------------------- 1 file changed, 63 insertions(+), 223 deletions(-) diff --git a/docs/redesign/spacy_home.jade b/docs/redesign/spacy_home.jade index 688863923..c89d830cd 100644 --- a/docs/redesign/spacy_home.jade +++ b/docs/redesign/spacy_home.jade @@ -1,22 +1,33 @@ -- var slogan = "Build Tomorrow's Language Technologies" -- var tag_line = "spaCy – #{slogan}" -- var a_minor_miracle = 'a minor miracle' +extends ./outline.jade -mixin lede() +// Notes +// +// 1. Where to put version notice? Should say something like +// 2015-08-12: v0.89 +// and be a link +// +// Only needs to appear on home page. + + +- var slogan = "Build Tomorrow's Language Technologies" +- var tag_line = "spaCy – " + slogan + +mixin lede + - var state_of_the_art = 'state-of-the-art' + - var a_minor_miracle = 'a minor miracle' + - var great_documentation = 'great documentation' + p. - spaCy is a library for industrial-strength NLP in Python and - Cython. It features state-of-the-art speed and accuracy, a concise API, and - great documentation. 
If you're a small company doing NLP, we want spaCy to - seem like !{a_minor_miracle}. + spaCy is a + library for industrial-strength NLP in Python and Cython. It features + !{state_of_the_art} speed and accuracy, a concise API, and great documentation. + If you're a small company doing NLP, we want spaCy to seem + like !{a_minor_miracle}. mixin overview() p. Overview text -mixin example() - p. - Example text - mixin benchmarks() p. Benchmarks @@ -25,18 +36,11 @@ mixin get_started() p. Get Started -mixin example(name) - details - summary - span(class="example-name")= name - - block mixin comparison(name) details summary - h4 - name + h4= name block @@ -52,215 +56,51 @@ mixin row(...cells) td= cell -doctype html -html(lang="en") - head - meta(charset="utf-8") - title!= tag_line - meta(name="description" content="") - meta(name="author" content="Matthew Honnibal") - link(rel="stylesheet" href="css/style.css") - +mixin social + footer(role="contentinfo") + a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter - body(id="page" role="document") - header(role="banner") - h1(class="logo")!= tag_line - div(class="slogan")!= slogan + div.discuss + a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn") + | Discuss on Hacker News + + a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit") + | Discuss on Reddit + + +mixin Section(title_text, link_name, include_file) + a(name=link_name): h3 #{title_text} + + if (link_name == "example-use") + include ./usage_examples.jade + else if (link_name == "online-demo") + include ./online_demo.jade + else if (link_name == "comparisons") + include ./comparisons.jade + else if (link_name == "install") + include ./installation.jade + + +block intro_block + section(class="intro") + +lede nav(role="navigation") ul - li: a(href="#") Home - li: a(href="#") Docs - li: a(href="#") License - li: a(href="#") Blog - - main(id="content" role="main") - section(class="intro") - +lede - - nav(role="navigation") - ul - li: a(href="#overview" class="button") Examples - li: a(href="#overview" class="button") Comparisons - li: a(href="#example-use" class="button") Demo - li: a(href="#get-started" class="button") Install - - article(class="page landing-page") - a(name="example-use"): h3 Usage by Example - - +example("Load resources and process text") - pre.language-python - code - | from __future__ import unicode_literals, print_function - | from spacy.en import English - | nlp = English() - | doc = nlp('Hello, world. 
Here are two sentences.') - - +example("Get tokens and sentences") - pre.language-python - code - | token = doc[0] - | sentence = doc.sents[0] - | assert token[0] is sentence[0] - - +example("Use integer IDs for any string") - pre.language-python - code - | hello_id = nlp.vocab.strings['Hello'] - | hello_str = nlp.vocab.strings[hello_id] - | - | assert token.orth == hello_id == 52 - | assert token.orth_ == hello_str == 'Hello' - - +example("Get and set string views and flags") - pre.language-python - code - | assert token.shape_ == 'Xxxx' - | for lexeme in nlp.vocab: - | if lexeme.is_alpha: - | lexeme.shape_ = 'W' - | elif lexeme.is_digit: - | lexeme.shape_ = 'D' - | elif lexeme.is_punct: - | lexeme.shape_ = 'P' - | else: - | lexeme.shape_ = 'M' - | assert token.shape_ == 'W' - - +example("Export to numpy arrays") - pre.language-python - code - | Do me - - +example("Word vectors") - pre.language-python - code - | Do me - - +example("Part-of-speech tags") - pre.language-python - code - | Do me - - +example("Syntactic dependencies") - pre.language-python - code - | Do me - - +example("Named entities") - pre.language-python - code - | Do me - - +example("Define custom NER rules") - pre.language-python - code - | Do me - - +example("Calculate inline mark-up on original string") - pre.language-python - code - | Do me - - +example("Efficient binary serialization") - pre.language-python - code - | Do me - - a(name="benchmarks"): h3 Benchmarks - - +comparison("spaCy vs. NLTK") - +comparison("spaCy vs. Pattern") - +comparison("spaCy vs. CoreNLP") - +comparison("spaCy vs. ClearNLP") - +comparison("spaCy vs. OpenNLP") - +comparison("spaCy vs. GATE") - - details - summary: h4 Independent Evaluation - - p - | Independent evaluation by Yahoo! Labs and Emory - | University, to appear at ACL 2015. Higher is better. - - table - thead - +columns("System", "Language", "Accuracy", "Speed") - - tbody - +row("spaCy v0.86", "Cython", "91.9", "13,963") - +row("spaCy v0.84", "Cython", "90.6", "13,963") - +row("ClearNLP", "Java", "91.7", "10,271") - +row("CoreNLP", "Java", "89.6", "8,602") - +row("MATE", "Java", "92.5", "550") - +row("Turbo", "C++", "92.4", "349") - +row("Yara", "Java", "92.3", "340") - - p - | Accuracy is % unlabelled arcs correct, speed is tokens per second. - - p - | Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory) - | performed a detailed comparison of the best parsers available. - | All numbers above are taken from the pre-print they kindly made - | available to me, except for spaCy v0.86. - - p - | I'm particularly grateful to the authors for discussion of their - | results, which led to the improvement in accuracy between v0.84 and - | v0.86. A tip from Jin-ho developer of ClearNLP) was particularly - | useful. - - details - summary: h4 Detailed Accuracy Comparison - - details - summary: h4 Detailed Speed Comparison - - table - thead - tr - th. 
- th(colspan=3) Absolute (ms per doc) - th(colspan=3) Relative (to spaCy) - - tbody - tr - td: strong System - td: strong Split - td: strong Tag - td: strong Parse - td: strong Split - td: strong Tag - td: strong Parse - - +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") - +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") - +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") - +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") - +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") - - p - | Set up: 100,000 plain-text documents were streamed - | from an SQLite3 database, and processed with an NLP library, to one - | of three levels of detail – tokenization, tagging, or parsing. - | The tasks are additive: to parse the text you have to tokenize and - | tag it. The pre-processing was not subtracted from the times – - | I report the time required for the pipeline to complete. I report - | mean times per document, in milliseconds. - - p - | Hardware: Intel i7-3770 (2012) - - a(name="get-started"): h3 Get started - - +get_started + li: a(href="#example-use" class="button") Examples + li: a(href="#online-demo" class="button") Demo + li: a(href="#comparisons" class="button") Comparisons + li: a(href="#install" class="button") Install v0.89 +block body_block + article(class="page landing-page") + +Section("Usage by Example", "example-use", "./usage_examples.jade") - footer(role="contentinfo") + +Section("Online Demo", "online-demo", "./online_demo.jade") + + +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") + + +Section("Install", "install", "./install.jade") - script(src="js/prism.js") From a8594452936e2e6a436521eb088ebc1d4c0149f6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 22:40:20 +0200 Subject: [PATCH 061/138] * Rename --- docs/redesign/{spacy_home.jade => home.jade} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/redesign/{spacy_home.jade => home.jade} (100%) diff --git a/docs/redesign/spacy_home.jade b/docs/redesign/home.jade similarity index 100% rename from docs/redesign/spacy_home.jade rename to docs/redesign/home.jade From 6f7b3efe85803dfee14cd66bd87598de823446e9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 22:49:57 +0200 Subject: [PATCH 062/138] * Rename spacy_docs to docs.jade --- docs/redesign/{spacy_docs.jade => docs.jade} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/redesign/{spacy_docs.jade => docs.jade} (100%) diff --git a/docs/redesign/spacy_docs.jade b/docs/redesign/docs.jade similarity index 100% rename from docs/redesign/spacy_docs.jade rename to docs/redesign/docs.jade From b57a3ddd7ef745d8f48200ed843bdf7bb5c740e1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 12 Aug 2015 22:51:15 +0200 Subject: [PATCH 063/138] * Add pieces of home page as separate jade files --- docs/redesign/installation.jade | 40 +++++++++++ docs/redesign/online_demo.jade | 0 docs/redesign/outline.jade | 37 ++++++++++ docs/redesign/usage_examples.jade | 109 ++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+) create mode 100644 docs/redesign/installation.jade create mode 100644 docs/redesign/online_demo.jade create mode 100644 docs/redesign/outline.jade create mode 100644 docs/redesign/usage_examples.jade diff --git a/docs/redesign/installation.jade b/docs/redesign/installation.jade new file mode 100644 index 000000000..05f89dd24 --- /dev/null +++ b/docs/redesign/installation.jade @@ -0,0 +1,40 @@ +p With Python 2.7 or Python 3, using 
Linux or OSX, run: + +pre.language-bash: code + | $ pip install spacy + | $ python -m spacy.en.download + +p + | The download command fetches and installs about 300mb of data, for + | the parser model and word vectors, which it installs within the spacy.en + | package directory. + +p + | If you're stuck using a server with an old version of Python, and you + | don't have root access, I've prepared a bootstrap script to help you + | compile a local Python install. Run: + +pre.language-bash: code + | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate + +p + | The other way to install the package is to clone the github repository, + | and build it from source. This installs an additional dependency, + | Cython. If you're using Python 2, I also recommend installing fabric + | and fabtools – this is how I build the project. + +pre.language-bash: code + | $ git clone https://github.com/honnibal/spaCy.git + | $ cd spaCy + | $ virtualenv .env && source .env/bin/activate + | $ export PYTHONPATH=`pwd` + | $ pip install -r requirements.txt + | $ python setup.py build_ext --inplace + | $ python -m spacy.en.download + | $ pip install pytest + | $ py.test tests/ + +p + | Python packaging is awkward at the best of times, and it's particularly tricky + | with C extensions, built via Cython, requiring large data files. So, + | please report issues as you encounter them. diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade new file mode 100644 index 000000000..e69de29bb diff --git a/docs/redesign/outline.jade b/docs/redesign/outline.jade new file mode 100644 index 000000000..2389dc71e --- /dev/null +++ b/docs/redesign/outline.jade @@ -0,0 +1,37 @@ +- var slogan = "Build Tomorrow's Language Technologies" +- var tag_line = "spaCy – " + slogan + + +doctype html +html(lang="en") + head + meta(charset="utf-8") + title!= tag_line + meta(name="description" content="") + meta(name="author" content="Matthew Honnibal") + link(rel="stylesheet" href="css/style.css") + + + body(id="home" role="document") + header(role="banner") + h1(class="logo")!= tag_line + div(class="slogan")!= slogan + + nav(role="navigation") + ul + li: a(href="#") Home + li: a(href="#") Docs + li: a(href="#") License + li: a(href="#") More + + main(id="content" role="main") + block intro_block + + block body_block + + footer(role="contentinfo") + + script(src="js/prism.js") + script(src="js/details_polyfill.js") diff --git a/docs/redesign/usage_examples.jade b/docs/redesign/usage_examples.jade new file mode 100644 index 000000000..d429339d4 --- /dev/null +++ b/docs/redesign/usage_examples.jade @@ -0,0 +1,109 @@ +mixin example(name) + details + summary + h4= name + block + + ++example("Load resources and process text") + pre.language-python: code + | from __future__ import unicode_literals, print_function + | from spacy.en import English + | nlp = English() + | doc = nlp('Hello, world. 
Here are two sentences.') + ++example("Get tokens and sentences") + pre.language-python: code + | token = doc[0] + | sentence = doc.sents[0] + | assert token[0] is sentence[0] + ++example("Use integer IDs for any string") + pre.language-python: code + | hello_id = nlp.vocab.strings['Hello'] + | hello_str = nlp.vocab.strings[hello_id] + | + | assert token.orth == hello_id == 52 + | assert token.orth_ == hello_str == 'Hello' + ++example("Get and set string views and flags") + pre.language-python: code + | assert token.shape_ == 'Xxxx' + | for lexeme in nlp.vocab: + | if lexeme.is_alpha: + | lexeme.shape_ = 'W' + | elif lexeme.is_digit: + | lexeme.shape_ = 'D' + | elif lexeme.is_punct: + | lexeme.shape_ = 'P' + | else: + | lexeme.shape_ = 'M' + | assert token.shape_ == 'W' + ++example("Export to numpy arrays") + pre.language-python: code + | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV + | + | attr_ids = [ORTH, LIKE_URL, IS_OOV] + | doc_array = doc.to_array(attr_ids) + | assert doc_array.shape == (len(doc), len(attrs) + | assert doc[0].orth == doc_array[0, 0] + | assert doc[1].orth == doc_array[1, 0] + | assert doc[0].like_url == doc_array[0, 1] + | assert doc_array[, 1] == [t.like_url for t in doc] + ++example("Word vectors") + pre.language-python: code + | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") + | + | apples = doc[0] + | oranges = doc[1] + | boots = doc[6] + | hippos = doc[8] + | + | assert apples.similarity(oranges) > boots.similarity(hippos) + + ++example("Part-of-speech tags") + pre.language-python: code + | doc[0].pos + | doc[0].tag + ++example("Syntactic dependencies") + pre.language-python: code + | for head in tokens: + | for child in head.lefts: + | assert child.head is head + | for child in head.rights: + | assert child.head is head + | sent = nlp('The four wheels on the bus turned quickly.') + | wheels = sent[2] + | bus = sent[5] + | assert len(list(wheels.lefts)) == 2 + | assert len(list(wheels.rights)) == 1 + | assert len(list(wheels.children)) == 3 + | assert len(list(bus.lefts)) == 1 + | assert len(list(bus.rights)) == 0 + | assert len(list(bus.children)) == 1 + | + | assert len(list(wheels.subtree)) == 6 + ++example("Named entities") + pre.language-python: code + | doc.ents + | token.ent_type + | token.ent_iob + ++example("Define custom NER rules") + pre.language-python: code + | nlp.matcher + ++example("Calculate inline mark-up on original string") + pre.language-python: code + | token.string + | token.spacy + | token.whitespace_ + ++example("Efficient binary serialization") + pre.language-python: code + | From ba00c72505257ba8bb25fd8ffca41a3a62e6f931 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 13 Aug 2015 01:11:40 +0200 Subject: [PATCH 064/138] * Add spec.jade --- docs/redesign/spec.jade | 123 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 docs/redesign/spec.jade diff --git a/docs/redesign/spec.jade b/docs/redesign/spec.jade new file mode 100644 index 000000000..a61a4f356 --- /dev/null +++ b/docs/redesign/spec.jade @@ -0,0 +1,123 @@ +extends ./outline.jade + +mixin columns(...names) + tr + each name in names + th= name + + +mixin row(...cells) + tr + each cell in cells + td= cell + + +block body_block + article(class="page docs-page") + p. + This document describes the target annotations spaCy is trained to predict. + This is currently a work in progress. Please ask questions on the issue tracker, + so that the answers can be integrated here to improve the documentation. 
+ + h2 Tokenization + + p Tokenization standards are based on the OntoNotes 5 corpus. + + p. + The tokenizer differs from most by including tokens for significant + whitespace. Any sequence of whitespace characters beyond a single space + (' ') is included as a token. For instance: + + pre.language-python + code + | from spacy.en import English + | nlp = English(parse=False) + | tokens = nlp('Some\nspaces and\ttab characters') + | print([t.orth_ for t in tokens]) + + p Which produces: + + pre.language-python + code + | ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] + + p. + The whitespace tokens are useful for much the same reason punctuation is + – it's often an important delimiter in the text. By preserving + it in the token output, we are able to maintain a simple alignment + between the tokens and the original string, and we ensure that no + information is lost during processing. + + h3 Sentence boundary detection + + p. + Sentence boundaries are calculated from the syntactic parse tree, so + features such as punctuation and capitalisation play an important but + non-decisive role in determining the sentence boundaries. Usually this + means that the sentence boundaries will at least coincide with clause + boundaries, even given poorly punctuated text. + + h3 Part-of-speech Tagging + + p. + The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank + tag set. We also map the tags to the simpler Google Universal POS Tag set. + + Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 + + h3 Lemmatization + + p. + A "lemma" is the uninflected form of a word. In English, this means: + + ul + li Adjectives: The form like "happy", not "happier" or "happiest" + li Adverbs: The form like "badly", not "worse" or "worst" + li Nouns: The form like "dog", not "dogs"; like "child", not "children" + li Verbs: The form like "write", not "writes", "writing", "wrote" or "written" + + p. + The lemmatization data is taken from WordNet. However, we also add a + special case for pronouns: all pronouns are lemmatized to the special + token -PRON-. + + + h3 Syntactic Dependency Parsing + + p. + The parser is trained on data produced by the ClearNLP converter. Details + of the annotation scheme can be found here: http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf + + h3 Named Entity Recognition + + table + thead + +columns("Entity Type", "Description") + + tbody + +row("PERSON", "People, including fictional.") + +row("NORP", "Nationalities or religious or political groups.") + +row("FACILITY", "Buildings, airports, highways, bridges, etc.") + +row("ORG", "Companies, agencies, institutions, etc.") + +row("GPE", "Countries, cities, states.") + +row("LOC", "Non-GPE locations, mountain ranges, bodies of water.") + +row("PRODUCT", "Vehicles, weapons, foods, etc. 
(Not services") + +row("EVENT", "Named hurricanes, battles, wars, sports events, etc.") + +row("WORK_OF_ART", "Titles of books, songs, etc.") + +row("LAW", "Named documents made into laws") + +row("LANGUAGE", "Any named language") + + p The following values are also annotated in a style similar to names: + + table + thead + +columns("Entity Type", "Description") + + tbody + +row("DATE", "Absolute or relative dates or periods") + +row("TIME", "Times smaller than a day") + +row("PERCENT", 'Percentage (including “%”)') + +row("MONEY", "Monetary values, including unit") + +row("QUANTITY", "Measurements, as of weight or distance") + +row("ORDINAL", 'first", "second"') + +row("CARDINAL", "Numerals that do not fall under another type") From 8a252d08f98335a94e7c56a3aa467365a136c807 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 13 Aug 2015 14:40:53 +0200 Subject: [PATCH 065/138] * Add parser post in jade --- docs/redesign/blog_parser.jade | 923 +++++++++++++++++++++++++++++++++ 1 file changed, 923 insertions(+) create mode 100644 docs/redesign/blog_parser.jade diff --git a/docs/redesign/blog_parser.jade b/docs/redesign/blog_parser.jade new file mode 100644 index 000000000..34d312a1c --- /dev/null +++ b/docs/redesign/blog_parser.jade @@ -0,0 +1,923 @@ +- + var urls = { + 'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', + 'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", + 'implementation': 'https://gist.github.com/syllog1sm/10343947', + 'redshift': 'http://github.com/syllog1sm/redshift', + 'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', + 'acl_anthology': 'http://aclweb.org/anthology/', + 'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' + } + + +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + main#content(role='main') + article.post + header + h2 Parsing English with 500 lines of Python + .subhead + | by + a(href='#', rel='author') Matthew Honnibal + | on + time(datetime='2013-12-18') December 18, 2013 + p + | A + a(href=urls.google_ngrams) syntactic parser + | describes a sentence’s grammatical structure, to help another + | application reason about it. Natural languages introduce many unexpected + | ambiguities, which our world-knowledge immediately filters out. A + | favourite example: + + p.example They ate the pizza with anchovies + + p + img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity') + p + | A correct parse links “with” to “pizza”, while an incorrect parse + | links “with” to “eat”: + + .displacy + iframe(src='displacy/anchovies_bad.html', height='275') + + .displacy + iframe.displacy(src='displacy/anchovies_good.html', height='275') + a.view-displacy(href='#') View on displaCy + p.caption + | The Natural Language Processing (NLP) community has made big progress + | in syntactic parsing over the last few years. + + p + | The Natural Language Processing (NLP) community has made big progress + | in syntactic parsing over the last few years. 
It’s now possible for + | a tiny Python implementation to perform better than the widely-used + | Stanford PCFG parser. + + p + strong Update! + | The Stanford CoreNLP library now includes a greedy transition-based + | dependency parser, similar to the one described in this post, but with + | an improved learning strategy. It is much faster and more accurate + | than this simple Python implementation. + + table + thead + tr + th Parser + th Accuracy + th Speed (w/s) + th Language + th LOC + tbody + tr + td Stanford + td 89.6% + td 19 + td Java + td + | > 50,000 + sup + a(href='#note-1') [1] + tr + td + strong parser.py + td 89.8% + td 2,020 + td Python + td + strong ~500 + tr + td Redshift + td + strong 93.6% + td + strong 2,580 + td Cython + td ~4,000 + p + | The rest of the post sets up the problem, and then takes you through + a(href=urls.implementation) a concise implementation + | , prepared for this post. The first 200 lines of parser.py, the + | part-of-speech tagger and learner, are described + a(href=pos_tagger_url) here. You should probably at least skim that + | post before reading this one, unless you’re very familiar with NLP + | research. + p + | The Cython system, Redshift, was written for my current research. I + | plan to improve it for general use in June, after my contract ends + | at Macquarie University. The current version is + a(href=urls.redshift) hosted on GitHub + | . + h3 Problem Description + + p It’d be nice to type an instruction like this into your phone: + + p.example + Set volume to zero when I’m in a meeting, unless John’s school calls. + p + | And have it set the appropriate policy. On Android you can do this + | sort of thing with + a(href=urls.tasker) Tasker + | , but an NL interface would be much better. It’d be especially nice + | to receive a meaning representation you could edit, so you could see + | what it thinks you said, and correct it. + p + | There are lots of problems to solve to make that work, but some sort + | of syntactic representation is definitely necessary. We need to know that: + + p.example + Unless John’s school calls, when I’m in a meeting, set volume to zero + + p is another way of phrasing the first instruction, while: + + p.example + Unless John’s school, call when I’m in a meeting + + p means something completely different. + + p + | A dependency parser returns a graph of word-word relationships, + | intended to make such reasoning easier. Our graphs will be trees – + | edges will be directed, and every node (word) will have exactly one + | incoming arc (one dependency, with its head), except one. + + h4 Example usage + + pre.language-python. + + p. + The idea is that it should be slightly easier to reason from the parse, + than it was from the string. The parse-to-meaning mapping is hopefully + simpler than the string-to-meaning mapping. + + p. + The most confusing thing about this problem area is that “correctness” + is defined by convention — by annotation guidelines. If you haven’t + read the guidelines and you’re not a linguist, you can’t tell whether + the parse is “wrong” or “right”, which makes the whole task feel weird + and artificial. + + p. + For instance, there’s a mistake in the parse above: “John’s school + calls” is structured wrongly, according to the Stanford annotation + guidelines. The structure of that part of the sentence is how the + annotators were instructed to parse an example like “John’s school + clothes”. + + p + | It’s worth dwelling on this point a bit. 
We could, in theory, have + | written our guidelines so that the “correct” parses were reversed. + | There’s good reason to believe the parsing task will be harder if we + | reversed our convention, as it’d be less consistent with the rest of + | the grammar. + sup: a(href='#note-2') [2] + | But we could test that empirically, and we’d be pleased to gain an + | advantage by reversing the policy. + + p + | We definitely do want that distinction in the guidelines — we don’t + | want both to receive the same structure, or our output will be less + | useful. The annotation guidelines strike a balance between what + | distinctions downstream applications will find useful, and what + | parsers will be able to predict easily. + + h4 Projective trees + + p + | There’s a particularly useful simplification that we can make, when + | deciding what we want the graph to look like: we can restrict the + | graph structures we’ll be dealing with. This doesn’t just give us a + | likely advantage in learnability; it can have deep algorithmic + | implications. We follow most work on English in constraining the + | dependency graphs to be + em projective trees + | : + + ol + li Tree. Every word has exactly one head, except for the dummy ROOT symbol. + li + | Projective. For every pair of dependencies (a1, a2) and (b1, b2), + | if a1 < b2, then a2 >= b2. In other words, dependencies cannot “cross”. + | You can’t have a pair of dependencies that goes a1 b1 a2 b2, or + | b1 a1 b2 a2. + + p + | There’s a rich literature on parsing non-projective trees, and a + | smaller literature on parsing DAGs. But the parsing algorithm I’ll + | be explaining deals with projective trees. + + h3 Greedy transition-based parsing + + p + | Our parser takes as input a list of string tokens, and outputs a + | list of head indices, representing edges in the graph. If the + + em i + + | th member of heads is + + em j + + | , the dependency parse contains an edge (j, i). A transition-based + | parser is a finite-state transducer; it maps an array of N words + | onto an output array of N head indices: + + table.center + tbody + tr + td + em start + td MSNBC + td reported + td that + td Facebook + td bought + td WhatsApp + td for + td $16bn + td + em root + tr + td 0 + td 2 + td 9 + td 2 + td 4 + td 2 + td 4 + td 4 + td 7 + td 0 + p + | The heads array denotes that the head of + em MSNBC + | is + em reported + | : + em MSNBC + | is word 1, and + em reported + | is word 2, and + code.language-python heads[1] == 2 + | . You can already see why parsing a tree is handy — this data structure + | wouldn’t work if we had to output a DAG, where words may have multiple + | heads. + + p + | Although + code.language-python heads + | can be represented as an array, we’d actually like to maintain some + | alternate ways to access the parse, to make it easy and efficient to + | extract features. Our + + code.language-python Parse + | class looks like this: + + pre.language-python + code + | class Parse(object): + | def __init__(self, n): + | self.n = n + | self.heads = [None] * (n-1) + | self.lefts = [] + | self.rights = [] + | for i in range(n+1): + | self.lefts.append(DefaultList(0)) + | self.rights.append(DefaultList(0)) + | + | def add_arc(self, head, child): + | self.heads[child] = head + | if child < head: + | self.lefts[head].append(child) + | else: + | self.rights[head].append(child) + + p + | As well as the parse, we also have to keep track of where we’re up + | to in the sentence. 
We’ll do this with an index into the
+        code.language-python words
+        | array, and a stack, to which we’ll push words, before popping them
+        | once their head is set. So our state data structure is fundamentally:
+
+      ul
+        li An index, i, into the list of tokens;
+        li The dependencies added so far, in Parse
+        li
+          | A stack, containing words that occurred before i, for which we’re
+          | yet to assign a head.
+
+      p Each step of the parsing process applies one of three actions to the state:
+
+      pre.language-python
+        code
+          | SHIFT = 0; RIGHT = 1; LEFT = 2
+          | MOVES = [SHIFT, RIGHT, LEFT]
+          |
+          | def transition(move, i, stack, parse):
+          |     global SHIFT, RIGHT, LEFT
+          |     if move == SHIFT:
+          |         stack.append(i)
+          |         return i + 1
+          |     elif move == RIGHT:
+          |         parse.add_arc(stack[-2], stack.pop())
+          |         return i
+          |     elif move == LEFT:
+          |         parse.add_arc(i, stack.pop())
+          |         return i
+          |     raise GrammarError("Unknown move: %d" % move)
+
+
+
+      p
+        | The
+        code.language-python LEFT
+        | and
+        code.language-python RIGHT
+        | actions add dependencies and pop the stack, while
+        code.language-python SHIFT
+        | pushes the next buffer word onto the stack and advances i into the buffer.
+      p.
+        So, the parser starts with an empty stack, and a buffer index at 0, with
+        no dependencies recorded. It chooses one of the (valid) actions, and
+        applies it to the state. It continues choosing actions and applying
+        them until the stack is empty and the buffer index is at the end of
+        the input. (It’s hard to understand this sort of algorithm without
+        stepping through it. Try coming up with a sentence, drawing a projective
+        parse tree over it, and then try to reach the parse tree by choosing
+        the right sequence of transitions.)
+
+      p Here’s what the parsing loop looks like in code:
+
+      pre.language-python
+        code
+          | class Parser(object):
+          |     ...
+          |     def parse(self, words):
+          |         tags = self.tagger(words)
+          |         n = len(words)
+          |         idx = 1
+          |         stack = [0]
+          |         deps = Parse(n)
+          |         while stack or idx < n:
+          |             features = extract_features(words, tags, idx, n, stack, deps)
+          |             scores = self.model.score(features)
+          |             valid_moves = get_valid_moves(idx, n, len(stack))
+          |             next_move = max(valid_moves, key=lambda move: scores[move])
+          |             idx = transition(next_move, idx, stack, deps)
+          |         return tags, deps
+          |
+          | def get_valid_moves(i, n, stack_depth):
+          |     moves = []
+          |     if i < n:
+          |         moves.append(SHIFT)
+          |     if stack_depth >= 2:
+          |         moves.append(RIGHT)
+          |     if stack_depth >= 1:
+          |         moves.append(LEFT)
+          |     return moves
+
+      p.
+        We start by tagging the sentence, and initializing the state. We then
+        map the state to a set of features, which we score using a linear model.
+        We then find the best-scoring valid move, and apply it to the state.
+
+      p
+        | The model scoring works the same as it did in
+        a(href=urls.pos_post) the POS tagger.
+        | If you’re confused about the idea of extracting features and scoring
+        | them with a linear model, you should review that post. Here’s a reminder
+        | of how the model scoring works:
+
+      pre.language-python
+        code
+          | class Perceptron(object):
+          |     ...
+          |     def score(self, features):
+          |         all_weights = self.weights
+          |         scores = dict((clas, 0) for clas in self.classes)
+          |         for feat, value in features.items():
+          |             if value == 0:
+          |                 continue
+          |             if feat not in all_weights:
+          |                 continue
+          |             weights = all_weights[feat]
+          |             for clas, weight in weights.items():
+          |                 scores[clas] += value * weight
+          |         return scores
+
+      p.
+        It’s just summing the class-weights for each feature.
This is often + expressed as a dot-product, but when you’re dealing with multiple + classes, that gets awkward, I find. + + p. + The beam parser (RedShift) tracks multiple candidates, and only decides + on the best one at the very end. We’re going to trade away accuracy + in favour of efficiency and simplicity. We’ll only follow a single + analysis. Our search strategy will be entirely greedy, as it was with + the POS tagger. We’ll lock-in our choices at every step. + + p. + If you read the POS tagger post carefully, you might see the underlying + similarity. What we’ve done is mapped the parsing problem onto a + sequence-labelling problem, which we address using a “flat”, or unstructured, + learning algorithm (by doing greedy search). + + h3 Features + p. + Feature extraction code is always pretty ugly. The features for the parser + refer to a few tokens from the context: + + ul + li The first three words of the buffer (n0, n1, n2) + li The top three words of the stack (s0, s1, s2) + li The two leftmost children of s0 (s0b1, s0b2); + li The two rightmost children of s0 (s0f1, s0f2); + li The two leftmost children of n0 (n0b1, n0b2) + + p. + For these 12 tokens, we refer to the word-form, the part-of-speech tag, + and the number of left and right children attached to the token. + + p. + Because we’re using a linear model, we have our features refer to pairs + and triples of these atomic properties. + + pre.language-python + code + | def extract_features(words, tags, n0, n, stack, parse): + | def get_stack_context(depth, stack, data): + | if depth >= 3: + | return data[stack[-1]], data[stack[-2]], data[stack[-3]] + | elif depth >= 2: + | return data[stack[-1]], data[stack[-2]], '' + | elif depth == 1: + | return data[stack[-1]], '', '' + | else: + | return '', '', '' + | + | def get_buffer_context(i, n, data): + | if i + 1 >= n: + | return data[i], '', '' + | elif i + 2 >= n: + | return data[i], data[i + 1], '' + | else: + | return data[i], data[i + 1], data[i + 2] + | + | def get_parse_context(word, deps, data): + | if word == -1: + | return 0, '', '' + | deps = deps[word] + | valency = len(deps) + | if not valency: + | return 0, '', '' + | elif valency == 1: + | return 1, data[deps[-1]], '' + | else: + | return valency, data[deps[-1]], data[deps[-2]] + | + | features = {} + | # Set up the context pieces --- the word, W, and tag, T, of: + | # S0-2: Top three words on the stack + | # N0-2: First three words of the buffer + | # n0b1, n0b2: Two leftmost children of the first word of the buffer + | # s0b1, s0b2: Two leftmost children of the top word of the stack + | # s0f1, s0f2: Two rightmost children of the top word of the stack + | + | depth = len(stack) + | s0 = stack[-1] if depth else -1 + | + | Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) + | Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) + | + | Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) + | Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) + | + | Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) + | Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) + | + | Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) + | _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) + | + | Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) + | _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) + | + | Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) + | _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) + | + | # Cap numeric features at 5? 
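+          | # The s0/n0 distance below is capped at 5, so all longer
+          | # distances share a single feature value instead of creating
+          | # many rare, sparsely-seen features.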
+ | # String-distance + | Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0 + | + | features['bias'] = 1 + | # Add word and tag unigrams + | for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): + | if w: + | features['w=%s' % w] = 1 + | for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): + | if t: + | features['t=%s' % t] = 1 + | + | # Add word/tag pairs + | for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): + | if w or t: + | features['%d w=%s, t=%s' % (i, w, t)] = 1 + | + | # Add some bigrams + | features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1 + | features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 + | features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 + | features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 + | features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 + | features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 + | features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 + | features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 + | + | # Add some tag trigrams + | trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0), + | (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1), + | (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), + | (Ts0, Ts1, Ts1)) + | for i, (t1, t2, t3) in enumerate(trigrams): + | if t1 or t2 or t3: + | features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 + | + | # Add some valency and distance features + | vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) + | vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) + | d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), + | ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) + | for i, (w_t, v_d) in enumerate(vw + vt + d): + | if w_t or v_d: + | features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 + | return features + + + h3 Training + + p. + Weights are learned using the same algorithm, averaged perceptron, that + we used for part-of-speech tagging. Its key strength is that it’s an + online learning algorithm: examples stream in one-by-one, we make our + prediction, check the actual answer, and adjust our beliefs (weights) + if we were wrong. + + p The training loop looks like this: + + pre.language-python + code + | class Parser(object): + | ... + | def train_one(self, itn, words, gold_tags, gold_heads): + | n = len(words) + | i = 2; stack = [1]; parse = Parse(n) + | tags = self.tagger.tag(words) + | while stack or (i + 1) < n: + | features = extract_features(words, tags, i, n, stack, parse) + | scores = self.model.score(features) + | valid_moves = get_valid_moves(i, n, len(stack)) + | guess = max(valid_moves, key=lambda move: scores[move]) + | gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) + | best = max(gold_moves, key=lambda move: scores[move]) + | self.model.update(best, guess, features) + | i = transition(guess, i, stack, parse) + | # Return number correct + | return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) + + + + p. + The most interesting part of the training process is in + code.language-python get_gold_moves. + The performance of our parser is made possible by an advance by Goldberg + and Nivre (2012), who showed that we’d been doing this wrong for years. + + p + | In the POS-tagging post, I cautioned that during training you need to + | make sure you pass in the last two + em predicted + | tags as features for the current tag, not the last two + em gold + | tags. 
At test time you’ll only have the predicted tags, so if you + | base your features on the gold sequence during training, your training + | contexts won’t resemble your test-time contexts, so you’ll learn the + | wrong weights. + + p. + In parsing, the problem was that we didn’t know + em how + | to pass in the predicted sequence! Training worked by taking the + | gold-standard tree, and finding a transition sequence that led to it. + | i.e., you got back a sequence of moves, with the guarantee that if + | you followed those moves, you’d get the gold-standard dependencies. + + p + | The problem is, we didn’t know how to define the “correct” move to + | teach a parser to make if it was in any state that + em wasn’t + | along that gold-standard sequence. Once the parser had made a mistake, + | we didn’t know how to train from that example. + + p + | That was a big problem, because it meant that once the parser started + | making mistakes, it would end up in states unlike any in its training + | data – leading to yet more mistakes. The problem was specific + | to greedy parsers: once you use a beam, there’s a natural way to do + | structured prediction. + p + | The solution seems obvious once you know it, like all the best breakthroughs. + | What we do is define a function that asks “How many gold-standard + | dependencies can be recovered from this state?”. If you can define + | that function, then you can apply each move in turn, and ask, “How + | many gold-standard dependencies can be recovered from + em this + | state?”. If the action you applied allows + em fewer + | gold-standard dependencies to be reached, then it is sub-optimal. + + p That’s a lot to take in. + + p + | So we have this function + code.language-python Oracle(state) + | : + pre + code + Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | + p + | We also have a set of actions, each of which returns a new state. + | We want to know: + + ul + li shift_cost = Oracle(state) – Oracle(shift(state)) + li right_cost = Oracle(state) – Oracle(right(state)) + li left_cost = Oracle(state) – Oracle(left(state)) + + p + | Now, at least one of those costs + em has + | to be zero. Oracle(state) is asking, “what’s the cost of the best + | path forward?”, and the first action of that best path has to be + | shift, right, or left. + + p + | It turns out that we can derive Oracle fairly simply for many transition + | systems. The derivation for the transition system we’re using, Arc + | Hybrid, is in Goldberg and Nivre (2013). + + p + | We’re going to implement the oracle as a function that returns the + | zero-cost moves, rather than implementing a function Oracle(state). + | This prevents us from doing a bunch of costly copy operations. + | Hopefully the reasoning in the code isn’t too hard to follow, but + | you can also consult Goldberg and Nivre’s papers if you’re confused + | and want to get to the bottom of this. 
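+      p
+        | To make the cost reasoning concrete: if the word on top of the stack
+        | has its gold head at the front of the buffer, LEFT is the only
+        | zero-cost move, because it adds exactly that arc. SHIFT would bury
+        | the stack word under the buffer word, and RIGHT would attach it to
+        | the wrong head, so either way the gold arc could never be recovered.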
+ + pre.language-python + code + | def get_gold_moves(n0, n, stack, heads, gold): + | def deps_between(target, others, gold): + | for word in others: + | if gold[word] == target or gold[target] == word: + | return True + | return False + | + | valid = get_valid_moves(n0, n, len(stack)) + | if not stack or (SHIFT in valid and gold[n0] == stack[-1]): + | return [SHIFT] + | if gold[stack[-1]] == n0: + | return [LEFT] + | costly = set([m for m in MOVES if m not in valid]) + | # If the word behind s0 is its gold head, Left is incorrect + | if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: + | costly.add(LEFT) + | # If there are any dependencies between n0 and the stack, + | # pushing n0 will lose them. + | if SHIFT not in costly and deps_between(n0, stack, gold): + | costly.add(SHIFT) + | # If there are any dependencies between s0 and the buffer, popping + | # s0 will lose them. + | if deps_between(stack[-1], range(n0+1, n-1), gold): + | costly.add(LEFT) + | costly.add(RIGHT) + | return [m for m in MOVES if m not in costly] + + + + p + | Doing this “dynamic oracle” training procedure makes a big difference + | to accuracy — typically 1-2%, with no difference to the way the run-time + | works. The old “static oracle” greedy training procedure is fully + | obsolete; there’s no reason to do it that way any more. + + h3 Conclusion + + p + | I have the sense that language technologies, particularly those relating + | to grammar, are particularly mysterious. I can imagine having no idea + | what the program might even do. + + p + | I think it therefore seems natural to people that the best solutions + | would be over-whelmingly complicated. A 200,000 line Java package + | feels appropriate. + p + | But, algorithmic code is usually short, when only a single algorithm + | is implemented. And when you only implement one algorithm, and you + | know exactly what you want to write before you write a line, you + | also don’t pay for any unnecessary abstractions, which can have a + | big performance impact. + + h3 Notes + p + a(name='note-1') + | [1] I wasn’t really sure how to count the lines of code in the Stanford + | parser. Its jar file ships over 200k, but there are a lot of different + | models in it. It’s not important, but over 50k seems safe. + + p + a(name='note-2') + | [2] For instance, how would you parse, “John’s school of music calls”? + | You want to make sure the phrase “John’s school” has a consistent + | structure in both “John’s school calls” and “John’s school of music + | calls”. Reasoning about the different “slots” you can put a phrase + | into is a key way we reason about what syntactic analyses look like. + | You can think of each phrase as having a different shaped connector, + | which you need to plug into different slots — which each phrase also + | has a certain number of, each of a different shape. We’re trying to + | figure out what connectors are where, so we can figure out how the + | sentences are put together. + + h3 Idle speculation + p + | For a long time, incremental language processing algorithms were + | primarily of scientific interest. If you want to write a parser to + | test a theory about how the human sentence processor might work, well, + | that parser needs to build partial interpretations. There’s a wealth + | of evidence, including commonsense introspection, that establishes + | that we don’t buffer input and analyse it once the speaker has finished. + + p + | But now algorithms with that neat scientific feature are winning! 
+ | As best as I can tell, the secret to that success is to be: + + ul + li Incremental. Earlier words constrain the search. + li + | Error-driven. Training involves a working hypothesis, which is + | updated as it makes mistakes. + + p + | The links to human sentence processing seem tantalising. I look + | forward to seeing whether these engineering breakthroughs lead to + | any psycholinguistic advances. + + h3 Bibliography + + p + | The NLP literature is almost entirely open access. All of the relavant + | papers can be found + a(href=urls.acl_anthology, rel='nofollow') here + | . + p + | The parser I’ve described is an implementation of the dynamic-oracle + | Arc-Hybrid system here: + + span.bib-item + | Goldberg, Yoav; Nivre, Joakim. + em Training Deterministic Parsers with Non-Deterministic Oracles + | . TACL 2013 + p + | However, I wrote my own features for it. The arc-hybrid system was + | originally described here: + + span.bib-item + | Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic + | programming algorithms for transition-based dependency parsers. ACL 2011 + + p + | The dynamic oracle training method was first described here: + span.bib-item + | A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; + | Nivre, Joakim. COLING 2012 + + p + | This work depended on a big break-through in accuracy for transition-based + | parsers, when beam-search was properly explored by Zhang and Clark. + | They have several papers, but the preferred citation is: + + span.bib-item + | Zhang, Yue; Clark, Steven. Syntactic Processing Using the Generalized + | Perceptron and Beam Search. Computational Linguistics 2011 (1) + p + | Another important paper was this little feature engineering paper, + | which further improved the accuracy: + + span.bib-item + | Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with + | Rich Non-local Features. ACL 2011 + + p + | The generalised perceptron, which is the learning framework for these + | beam parsers, is from this paper: + span.bib-item + | Collins, Michael. Discriminative Training Methods for Hidden Markov + | Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002 + + h3 Experimental details + p + | The results at the start of the post refer to Section 22 of the Wall + | Street Journal corpus. The Stanford parser was run as follows: + + pre.language-bash + code + | java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \ + | -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $* + + + + p + | A small post-process was applied, to undo the fancy tokenisation + | Stanford adds for numbers, to make them match the PTB tokenisation: + + pre.language-python + code + | """Stanford parser retokenises numbers. 
Split them.""" + | import sys + | import re + | + | qp_re = re.compile('\xc2\xa0') + | for line in sys.stdin: + | line = line.rstrip() + | if qp_re.search(line): + | line = line.replace('(CD', '(QP (CD', 1) + ')' + | line = line.replace('\xc2\xa0', ') (CD ') + | print line + + p + | The resulting PTB-format files were then converted into dependencies + | using the Stanford converter: + + pre.language-bash + code + | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp + | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ + | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll + p + | I can’t easily read that anymore, but it should just convert every + | .mrg file in a folder to a CoNLL-format Stanford basic dependencies + | file, using the settings common in the dependency literature. + + p + | I then converted the gold-standard trees from WSJ 22, for the evaluation. + | Accuracy scores refer to unlabelled attachment score (i.e. the head index) + | of all non-punctuation tokens. + + p + | To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 + | into the same conversion script. + + p + | In a nutshell: The Stanford model and parser.py are trained on the + | same set of sentences, and they each make their predictions on a + | held-out test set, for which we know the answers. Accuracy refers + | to how many of the words’ heads we got correct. + + p + | Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a + | server, to give the Stanford parser more memory. The parser.py system + | runs fine on my MacBook Air. I used PyPy for the parser.py experiments; + | CPython was about half as fast on an early benchmark. + + p + | One of the reasons parser.py is so fast is that it does unlabelled + | parsing. Based on previous experiments, a labelled parser would likely + | be about 40x slower, and about 1% more accurate. Adapting the program + | to labelled parsing would be a good exercise for the reader, if you + | have access to the data. + + p + | The result from the Redshift parser was produced from commit + code.language-python b6b624c9900f3bf + | , which was run as follows: + pre.language-python. 
+ footer.meta(role='contentinfo') + a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter + .discuss + a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News + a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit + footer(role='contentinfo') + script(src='js/prism.js') + From 2f5028881393979d0a4821487592e45c69395ae6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 13 Aug 2015 14:43:56 +0200 Subject: [PATCH 066/138] * Add parser post in jade --- docs/redesign/blog_parser.jade | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/redesign/blog_parser.jade b/docs/redesign/blog_parser.jade index 34d312a1c..5806c677f 100644 --- a/docs/redesign/blog_parser.jade +++ b/docs/redesign/blog_parser.jade @@ -87,7 +87,7 @@ html(lang='en') td 19 td Java td - | > 50,000 + | > 4,000 sup a(href='#note-1') [1] tr @@ -96,7 +96,6 @@ html(lang='en') td 89.8% td 2,020 td Python - td strong ~500 tr td Redshift @@ -651,11 +650,11 @@ html(lang='en') p | So we have this function - code.language-python Oracle(state) + code Oracle(state) | : pre code - Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | + | Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | p | We also have a set of actions, each of which returns a new state. | We want to know: @@ -745,7 +744,7 @@ html(lang='en') a(name='note-1') | [1] I wasn’t really sure how to count the lines of code in the Stanford | parser. Its jar file ships over 200k, but there are a lot of different - | models in it. It’s not important, but over 50k seems safe. + | models in it. It’s not important, but it's certainly over 4k. p a(name='note-2') From 005074c31eec65984570579b047dce000509c228 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 13 Aug 2015 15:49:33 +0200 Subject: [PATCH 067/138] * Add post introducing spaCy --- docs/redesign/blog_intro.jade | 93 +++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 docs/redesign/blog_intro.jade diff --git a/docs/redesign/blog_intro.jade b/docs/redesign/blog_intro.jade new file mode 100644 index 000000000..8b56d7daf --- /dev/null +++ b/docs/redesign/blog_intro.jade @@ -0,0 +1,93 @@ +- + var urls = { + 'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', + 'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", + 'implementation': 'https://gist.github.com/syllog1sm/10343947', + 'redshift': 'http://github.com/syllog1sm/redshift', + 'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', + 'acl_anthology': 'http://aclweb.org/anthology/', + 'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' + } + + +- var my_research_software = 'my research software' + +- var how_to_write_a_POS_tagger = 'how to write a part-of-speech tagger' + +- var parser_lnk = 'parser' + +- var buy_a_commercial_license = 'buy a commercial license' + +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + main#content(role='main') + article.post + p. 
+ spaCy is a new library for text processing in Python + and Cython. I wrote it because I think small companies are terrible at + natural language processing (NLP). Or rather: small companies are using + terrible NLP technology. + + p. + To do great NLP, you have to know a little about linguistics, a lot + about machine learning, and almost everything about the latest research. + The people who fit this description seldom join small companies. + Most are broke – they've just finished grad school. + If they don't want to stay in academia, they join Google, IBM, etc. + + p. + The net result is that outside of the tech giants, commercial NLP has + changed little in the last ten years. In academia, it's changed entirely. + Amazing improvements in quality. Orders of magnitude faster. But the + academic code is always GPL, undocumented, unuseable, or all three. + You could implement the ideas yourself, but the papers are hard to read, + and training data is exorbitantly expensive. So what are you left with? + A common answer is NLTK, which was written primarily as an educational resource. + Nothing past the tokenizer is suitable for production use. + + p. + I used to think that the NLP community just needed to do more to communicate + its findings to software engineers. So I wrote two blog posts, explaining + !{how_to_write_a_POS_tagger} and !{parser_lnk}. Both were well + received, and there's been a bit of interest in !{my_research_software} + – even though it's entirely undocumented, and mostly unuseable to + anyone but me. + p. + So six months ago I quit my post-doc, and I've been working day and night + on spaCy since. I'm now pleased to announce an alpha release. + + p. + If you're a small company doing NLP, I think spaCy will seem like a minor + miracle. It's by far the fastest NLP software ever released. The + full processing pipeline completes in 20ms per document, including accurate + tagging and parsing. All strings are mapped to integer IDs, tokens are + linked to embedded word representations, and a range of useful features + are pre-calculated and cached. + + p. + If none of that made any sense to you, here's the gist of it. Computers + don't understand text. This is unfortunate, because that's what the + web almost entirely consists of. We want to recommend people text based + on other text they liked. We want to shorten text to display it on a + mobile screen. We want to aggregate it, link it, filter it, categorise + it, generate it and correct it. + + p. + spaCy provides a library of utility functions that help programmers + build such products. It's commercial open source software: you can + either use it under the AGPL, or you can !{buy_a_commercial_license} + under generous terms. + + footer(role='contentinfo') From 5ee645d742193293f8933de1ed63cd44a08aba70 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 14 Aug 2015 18:59:08 +0200 Subject: [PATCH 068/138] * Add Displacy mixin. 
Needs to pull the data from the web --- docs/redesign/online_demo.jade | 18 +++++ examples/twitter_filter.py | 141 +++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 examples/twitter_filter.py diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade index e69de29bb..0e2bbb331 100644 --- a/docs/redesign/online_demo.jade +++ b/docs/redesign/online_demo.jade @@ -0,0 +1,18 @@ +mixin Displacy(sentence, caption_text, height) + - var url = "http://ines.io/displacy/?full=" + sentence.replace(" ", "%20") + + .displacy + iframe.displacy(src="displacy/displacy_demo.html" height=height) + + a.view-displacy(href=url) + | View on displaCy + + p.caption. + #{caption_text} + + ++Displacy( + "Click the button to see this sentence in displaCy.", + "The best parse-tree visualizer and annotation tool in all the land.", + 275 +) diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py new file mode 100644 index 000000000..f842acdd4 --- /dev/null +++ b/examples/twitter_filter.py @@ -0,0 +1,141 @@ +from __future__ import unicode_literals, print_function +import plac +import codecs +import sys +import math + +import spacy.en +from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ + +from termcolor import colored +from twython import TwythonStreamer + +from os import path +from math import sqrt + +from numpy import dot +from numpy.linalg import norm + + +class Meaning(object): + def __init__(self, vectors): + if vectors: + self.vector = sum(vectors) / len(vectors) + self.norm = norm(self.vector) + else: + self.vector = None + self.norm = 0 + + @classmethod + def from_path(cls, nlp, loc): + with codecs.open(loc, 'r', 'utf8') as file_: + terms = file_.read().strip().split() + return cls.from_terms(nlp, terms) + + @classmethod + def from_tokens(cls, nlp, tokens): + vectors = [t.repvec for t in tokens] + return cls(vectors) + + @classmethod + def from_terms(cls, nlp, examples): + lexemes = [nlp.vocab[eg] for eg in examples] + vectors = [eg.repvec for eg in lexemes] + return cls(vectors) + + def similarity(self, other): + if not self.norm or not other.norm: + return -1 + return dot(self.vector, other.vector) / (self.norm * other.norm) + + +def print_colored(model, stream=sys.stdout): + if model['is_match']: + color = 'green' + elif model['is_reject']: + color = 'red' + else: + color = 'grey' + + if not model['is_rare'] and model['is_match'] and not model['is_reject']: + match_score = colored('%.3f' % model['match_score'], 'green') + reject_score = colored('%.3f' % model['reject_score'], 'red') + prob = '%.5f' % model['prob'] + + print(match_score, reject_score, prob) + print(repr(model['text']), color) + print('') + + +class TextMatcher(object): + def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject): + self.nlp = nlp + self.get_target = get_target + self.get_reject = get_reject + self.min_prob = min_prob + self.min_match = min_match + self.max_reject = max_reject + + def __call__(self, text): + tweet = self.nlp(text) + target_terms = self.get_target() + reject_terms = self.get_reject() + + prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) + meaning = Meaning.from_tokens(self, tweet) + + match_score = meaning.similarity(self.get_target()) + reject_score = meaning.similarity(self.get_reject()) + return { + 'text': tweet.string, + 'prob': prob, + 'match_score': match_score, + 'reject_score': reject_score, + 'is_rare': prob < self.min_prob, + 'is_match': prob >= self.min_prob and match_score >= self.min_match, + 
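+            # A tweet is rejected only when it is frequent enough to trust
+            # (prob above the threshold) but too similar to the reject terms.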
'is_reject': prob >= self.min_prob and reject_score >= self.max_reject + } + + +class Connection(TwythonStreamer): + def __init__(self, keys_dir, handler, view): + keys = Secrets(keys_dir) + TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret) + self.handler = handler + self.view = view + + def on_success(self, data): + text = data.get('text', u'') + # Twython returns either bytes or unicode, depending on tweet. + # #APIshaming + try: + model = self.handler(text) + except TypeError: + model = self.handler(text.decode('utf8')) + status = self.view(model, sys.stdin) + + def on_error(self, status_code, data): + print(status_code) + + +class Secrets(object): + def __init__(self, key_dir): + self.key = open(path.join(key_dir, 'key.txt')).read().strip() + self.secret = open(path.join(key_dir, 'secret.txt')).read().strip() + self.token = open(path.join(key_dir, 'token.txt')).read().strip() + self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip() + + +def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5): + # We don't need the parser for this demo, so may as well save the loading time + nlp = spacy.en.English(Parser=None) + get_target = lambda: Meaning.from_path(nlp, target_loc) + get_reject = lambda: Meaning.from_path(nlp, reject_loc) + matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject) + + twitter = Connection(keys_dir, matcher, print_colored) + twitter.statuses.filter(track=term) + + +if __name__ == '__main__': + plac.call(main) From c9b19a9c0012efbc227ec2a2aacd90bf4675d5f9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 14 Aug 2015 20:13:22 +0200 Subject: [PATCH 069/138] * Work on website --- docs/redesign/api.jade | 661 ++++++++++ docs/redesign/blog.jade | 135 ++ docs/redesign/blog_intro.jade | 122 +- docs/redesign/blog_parser.jade | 1720 +++++++++++++------------ docs/redesign/change_log.jade | 0 docs/redesign/comparisons.jade | 78 ++ docs/redesign/docs.jade | 622 +-------- docs/redesign/home.jade | 27 +- docs/redesign/installation.jade | 97 +- docs/redesign/license.jade | 179 +++ docs/redesign/mixins.jade | 19 + docs/redesign/outline.jade | 8 +- docs/redesign/spec.jade | 178 +-- docs/redesign/template_post.jade | 31 + docs/redesign/tute_adverbs.jade | 200 +++ docs/redesign/tute_syntax_search.jade | 132 ++ docs/redesign/tute_twitter.jade | 204 +++ docs/redesign/tutorials.jade | 0 docs/redesign/usage_examples.jade | 9 +- 19 files changed, 2762 insertions(+), 1660 deletions(-) create mode 100644 docs/redesign/api.jade create mode 100644 docs/redesign/blog.jade create mode 100644 docs/redesign/change_log.jade create mode 100644 docs/redesign/comparisons.jade create mode 100644 docs/redesign/license.jade create mode 100644 docs/redesign/mixins.jade create mode 100644 docs/redesign/template_post.jade create mode 100644 docs/redesign/tute_adverbs.jade create mode 100644 docs/redesign/tute_syntax_search.jade create mode 100644 docs/redesign/tute_twitter.jade create mode 100644 docs/redesign/tutorials.jade diff --git a/docs/redesign/api.jade b/docs/redesign/api.jade new file mode 100644 index 000000000..0bc956ce1 --- /dev/null +++ b/docs/redesign/api.jade @@ -0,0 +1,661 @@ +mixin declare_class(name) + details + summary + span.declaration + span.label class + code #{name} + block + +mixin method(name, parameters) + details(open=attributes.open) + summary + span.declaration + span.label #{name} + span.parameters + | self, #{parameters} + block + + +mixin params + 
ul + block + + +mixin param(name, type, value) + li + if type + #{name} (!{type}) – + else + #{name} – + block + + +mixin attribute(name, type, value) + details(open=attributes.open) + summary + span.declaration + span.label #{name} + block + + +mixin returns(name, type, value) + li + if type + #{name} (!{type}) – + else + #{name} – + block + + +mixin returns(type) + | tmp + +mixin init + details + summary: h4 Init + + block + + +mixin callable + details + summary: h4 Callable + + block + + +mixin sequence + details + summary: h4 Sequence + + block + + +mixin maptype + details + summary: h4 Map + + block + + +mixin summary + block + +mixin en_example + pre.language-python + code + | from spacy.en import English + | from spacy._doc_examples import download_war_and_peace + | + | unprocessed_unicode = download_war_and_peace() + | + | nlp = English() + | doc = nlp(unprocessed_unicode) + + ++declare_class("English") + p Load models into a callable object to process English text. + + +summary + +en_example + + +init + p + | Load the resources. Loading takes 20 seconds, and the instance + | consumes 2 to 3 gigabytes of memory. + + p + | Intended use is for one instance to be created per process. + | You can create more if you're doing something unusual. + p + | You may wish to make the instance a global variable or "singleton". + | We usually instantiate the object in the main() + | function and pass it around as an explicit argument. + +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") + + +params + +param("data_dir") + | The data directory. May be #{None}, to disable any data loading + | (including the vocabulary). + + +param("Tokenizer") + | A class/function that creates the tokenizer. + + +param("Tagger") + | A class/function that creates the part-of-speech tagger. + + +param("Parser") + | A class/function that creates the dependency parser. + + +param("Entity") + | A class/function that creates the named entity recogniser. + + +param("load_vectors") + | A boolean value to control whether the word vectors are loaded. + + +callable + +method("__call__", "text, tag=True, parse=True, entity=True") + + +params + +param("text", types.unicode) + | The text to be processed. No pre-processing needs to be applied, + | and any length of text can be submitted. Usually you will submit + | a whole document. Text may be zero-length. An exception is raised + | if byte strings are supplied. + + +param("tag", types.bool) + | Whether to apply the part-of-speech tagger. Required for parsing + | and entity recognition. + + +param("parse", types.bool) + | Whether to apply the syntactic dependency parser. + + +param("entity", types.bool) + | Whether to apply the named entity recognizer. + + pre.language-python + code + | from spacy.en import English + | nlp = English() + | doc = nlp(u'Some text.) # Applies tagger, parser, entity + | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser + | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity + | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser + | doc = nlp(u'') # Zero-length tokens, not an error + | # doc = nlp(b'Some text') <-- Error: need unicode + | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. 
+ + ++declare_class("Doc") + p I'm a doc + + +init + +method("__init__", "vocab") + +params + +param("vocab", vocab_type) + | A vocabulary object + + +sequence + +method("__getitem__", "i", types.int) + +returns(types.Token) + + +method("__getitem__", "start_end", types.slice) + +returns(types.Span) + + +method("__iter__") + | Iterate over tokens + + +method("__len__") + | Number of tokens in the document. + + details + summary: h4 Spans + + +attribute("sents", types.generator) + | Iterate over sentences in the document. + + +attribute("ents", types.generator) + | Iterate over named entities in the document. + + +attribute("noun_chunks", types.generator) + + details + summary: h4 Export/Import + + +method("to_array", "attr_ids") + + | Given a list of M attribute IDs, export the tokens to a numpy ndarray + | of shape N*M, where N is the length of the sentence. + + +params + +param("attr_ids", "list[int]") + | A list of attribute ID ints. + + +returns("feat_array") + | A feature matrix, with one row per word, and one column per attribute + | indicated in the input attr_ids. + + +method("count_by", "attr_id") + | Produce a dict of {attribute (int): count (ints)} frequencies, keyed + | by the values of the given attribute ID. + + pre.language-python + code + | >>> from spacy.en import English, attrs + | >>> nlp = English() + | >>> tokens = nlp(u'apple apple orange banana') + | >>> tokens.count_by(attrs.ORTH) + | {12800L: 1, 11880L: 2, 7561L: 1} + | >>> tokens.to_array([attrs.ORTH]) + | array([[11880], + | [11880], + | [7561], + | [12800]]) + + +method("from_array", "attrs, array") + | Load from array + + +method("from_bytes") + | Deserialize, loading from bytes + + +method("read_bytes") + | classmethod + + //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") + + // | Merge a multi-word expression into a single token. Currently + // | experimental; API is likely to change. + + ++declare_class("Token") + +init + +method("__init__", "vocab, doc, offset") + +params + +param("vocab", types.Vocab) + p A Vocab object + + +param("doc", types.Doc) + p The parent sequence + + +param("offset", types.int) + p The index of the token within the document + + details + summary: h4 String Views + + +attribute("orth / orth_") + | The form of the word with no string normalization or processing, as + | it appears in the string, without trailing whitespace. + + +attribute("lemma / lemma_") + | The "base" of the word, with no inflectional suffixes, e.g. the lemma of + | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that + | derivational suffixes are not stripped, e.g. the lemma of + | "instutitions" is "institution", not "institute". Lemmatization is + | performed using the WordNet data, but extended to also cover closed-class + | words such as pronouns. By default, the WN lemmatizer returns "hi" + | as the lemma of "his". We assign pronouns the lemma -PRON-. + + +attribute("lower / lower_") + | The form of the word, but forced to lower-case, i.e. + pre.language-python: code lower = word.orth\_.lower() + + //+attribute("norm / norm_") + // | The form of the word, after language-specific normalizations has been + // | applied. + + +attribute("shape / shape_") + | A transform of the word's string, to show orthographic features. + | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped + | to d. After these mappings, sequences of 4 or more of the same character + | are truncated to length 4. 
Examples: C3Po --> XdXx, favorite --> xxxx, + | :) --> :) + + +attribute("prefix / prefix_") + | A length-N substring from the start of the word. Length may vary by + | language; currently for English n=1, i.e. + pre.language-python: code prefix = word.orth\_[:1] + + +attribute("suffix / suffix_") + | A length-N substring from the end of the word. Length may vary by + | language; currently for English n=3, i.e. + pre.language-python: code suffix = word.orth\_[-3:] + + //+attribute("lex_id") + // | lex_id + + details + summary: h4 Alignment and Output + + +attribute("idx") + p Start index of the token in the string + + +method("__len__", "") + p Length of the token's orth string, in unicode code-points. + + +method("__unicode__", "") + p Same as token.orth_ + + +method("__str__", "") + p Varies between Python 2 and Python 3 + + +attribute("string") + p + | The form of the word as it appears in the string, including + | trailing whitespace. This is useful when you need to use + | linguistic features to add inline mark-up to the string. + + +method("nbor, i=1") + +params + +param("i") + p Offset relative to token + + details + summary: h4 Distributional Features + + +attribute("repvec") + p + | A "word embedding" representation: a dense real-valued vector that supports + | similarity queries between words. By default, spaCy currently loads + | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec + | model. + + +attribute("cluster") + p + | The Brown cluster ID of the word. These are often useful features for + | linear models. If you're using a non-linear model, particularly a + | neural net or random forest, consider using the real-valued word + | representation vector, in Token.repvec, instead. + + +attribute("prob") + p + | The unigram log-probability of the word, estimated from counts from a + | large corpus, smoothed using Simple Good Turing estimation. + + details + summary: h4 Syntactic Tags + + +attribute("pos / pos_") + p + | A part-of-speech tag, from the Google Universal Tag Set, e.g. + | code>NOUN, VERB, ADV. Constants for + | the 17 tag values are provided in spacy.parts_of_speech. + + +attribute("tag / tag_") + p + | A morphosyntactic tag, e.g. NN, VBZ, + | DT, etc. These tags are language/corpus specific, and + | typically describe part-of-speech and some amount of morphological + | information. For instance, in the Penn Treebank tag set, VBZ + | is assigned to a present-tense singular verb. + + +attribute("dep / dep_") + p + | The type of syntactic dependency relation between the word and its + | syntactic head. + + details + summary: h4 Navigating the Parse Tree + + +attribute("head") + p + | The Token that is the immediate syntactic head of the word. If the + | word is the root of the dependency tree, the same word is returned. + + +attribute("lefts") + p + | An iterator for the immediate leftward syntactic children of the + | word. + + +attribute("rights") + p + | An iterator for the immediate rightward syntactic children of the + | word. + + +attribute("n_lefts") + p + | The number of immediate syntactic children preceding the word in + | the string. + + +attribute("n_rights") + p + | The number of immediate syntactic children following the word in + | the string. + + +attribute("children") + p + | An iterator that yields from lefts, and then yields from rights. + + +attribute("subtree") + p + | An iterator for the part of the sentence syntactically governed by + | the word, including the word itself. 
+ + +attribute("left_edge") + p The leftmost edge of the token's subtree + + +attribute("right_edge") + p The rightmost edge of the token's subtree + + details + summary: h4 Named Entities + + +attribute("ent_type") + p If the token is part of an entity, its entity type. + + +attribute("ent_iob") + p The IOB (inside, outside, begin) entity recognition tag for the token. + + details + summary: h4 Lexeme Flags + + +method("check_flag", "flag_id") + +params + +param("flag_id") + | flag ID + + +attribute("is_oov") + +attribute("is_alpha") + +attribute("is_ascii") + +attribute("is_digit") + +attribute("is_lower") + +attribute("is_title") + +attribute("is_punct") + +attribute("is_space") + +attribute("like_url") + +attribute("like_num") + +attribute("like_email") + + //+attribute("conjuncts") + // | Conjuncts + ++declare_class("Span") + +init + +method("__init__") + Temp + + span = doc[0:4] + + +sequence + +method("__getitem__") + p Get item + + +method("__iter__") + p Iter + + +method("__len__") + p Len + + details + summary: h4 Parse + + +attribute("root") + p Syntactic head + + +attribute("lefts") + p Tokens that are: + ol + li To the left of the span; + li Syntactic children of words within the span + + p i.e. + + pre.language-python + code + | lefts = [span.doc[i] for i in range(0, span.start) + | if span.doc[i].head in span] + + +attribute("rights") + p Tokens that are: + ol + li To the right of the span; + li Syntactic children of words within the span + p i.e. + pre.language-python + code + | rights = [span.doc[i] for i in range(span.end, len(span.doc)) + | if span.doc[i].head in span] + + + +attribute("subtree") + p String + + details + summary: h4 String Views + + +attribute("string") + p String + + +attribute("lemma / lemma_") + p String + + +attribute("label / label_") + p String + ++declare_class("Lexeme") + p + | The Lexeme object represents a lexical type, stored in the vocabulary + | – as opposed to a token, occurring in a document. + p + | Lexemes store various features, so that these features can be computed + | once per type, rather than once per token. As job sizes grow, this + | can amount to a substantial efficiency improvement. + + p + | All Lexeme attributes are therefore context independent, as a single + | lexeme is reused for all usages of that word. Lexemes are keyed by + | the “orth” attribute. + + p + All Lexeme attributes are accessible directly on the Token object. + + +init + +method("__init__") + p Init + + details + summary: h4 String Features + + +attribute("orth / orth_") + p + | The form of the word with no string normalization or processing, + | as it appears in the string, without trailing whitespace. + + +attribute("lower / lower_") + p Tmp + + +attribute("norm / norm_") + p Tmp + + +attribute("shape / shape_") + p Tmp + + +attribute("prefix / prefix_") + p Tmp + + +attribute("suffix / suffix_") + p TMP + ++declare_class("Vocab", "data_dir=None, lex_props_getter=None") + +sequence + +method("__len__") + +returns + p Number of words in the vocabulary. 
+ + +method("__iter__") + +returns + p Lexeme + + +maptype + +method("__getitem__", "key_int") + +params + +param("key") + p Integer ID + + +returns: p A Lexeme object + + +method("__getitem__", "key_str") + +params + +param("key_str", types.unicode) + p A string in the vocabulary + + +returns("Lexeme") + + +method("__setitem__", "orth_str", "props") + +params + +param("orth_str", types.unicode) + p The orth key + + +param("props", types.dict) + p A props dictionary + + +returns("None") + + details + summary: h4 Import/Export + + +method("dump", "loc") + +params + +param("loc", types.unicode) + p Path where the vocabulary should be saved + + +method("load_lexemes", "loc") + +params + +param("loc", types.unicode) + p Path to load the lexemes.bin file from + + +method("load_vectors", "loc") + +params + +param("loc", types.unicode) + p Path to load the vectors.bin from + ++declare_class("StringStore") + +init + Tmp + + +sequence + +method("__len__") + +returns("int") + p Number of strings in the string-store + + +method("__iter__") + +returns + p Lexeme + + +maptype + +method("__getitem__", "key_int") + +params + +param("key_int") + p An integer key + + +returns(types.unicode) + p The string that the integer key maps to + + +method("__getitem__", "key_unicode") + +params + +param("key_unicode") + p A key, as a unicode string + + +returns(types.int) + p The integer ID of the string. + + +method("__getitem__", "key_utf8_bytes") + +params + +param("key_utf8_bytes", types.bytes) + p p A key, as a UTF-8 encoded byte-string + + +returns(types.int) + p The integer ID of the string. + + details + summary: h4 Import/Export + + +method("dump", "loc") + +params + +param("loc") + p File path to save the strings.txt to. + + +method("load") + +params + +param("loc") + p File path to load the strings.txt from. diff --git a/docs/redesign/blog.jade b/docs/redesign/blog.jade new file mode 100644 index 000000000..119a5aad9 --- /dev/null +++ b/docs/redesign/blog.jade @@ -0,0 +1,135 @@ +mixin Teaser(title, url, date_long, date_short, author, lede) + article.post + header + h2 + a(href=url)= title + .subhead + | by + a(href='#', rel='author')= author + | on + time(datetime=date_short)= date_long + p!= lede +   + a.readmore(href='#') ► + + +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + + nav(role="navigation") + ul + li: a(href="home.html") Home + li: a(href="docs.html") Docs + li.active: a(href="blog.html") Blog + li: a(href="license.html") License + + main#content(role='main') + section.intro.profile + p + img(src='img/matt.png') + | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. + span.social + a(href='#') Follow me on Twitter + nav(role='navigation') + ul + li + a.button(href='#') Blog + li + a.button(href='#tutorials') Tutorials + section.blogs + +Teaser( + "Introducing spaCy", + "blog_intro.html", + "February 2015", + "2015-02-18", + "Matthew Honnibal", + "spaCy is a new library for text processing in Python " + + "and Cython. I wrote it because I think small companies are terrible at " + + "natural language processing (NLP). Or rather: small companies are using " + + "terrible NLP technology." 
+ ) + + +Teaser( + "Parsing English with 500 lines of Python", + "blog_parser.html", + "December 18, 2013", + "2013-12-18", + "Matthew Hannibal", + "The Natural Language Processing (NLP) community has made big progress" + + "in syntactic parsing over the last few years. It’s now possible for a" + + "tiny Python implementation to perform better than the widely-used Stanford " + + "PCFG parser.") + + article.post + header + h2 + a(href='#') Another headline + .subhead + | by + a(href='#', rel='author') Matthew Honnibal + | on + time(datetime='2013-12-18') December 18, 2013 + p + | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. + a.readmore(href='#') ► + article.post + header + h2 + a(href='#') Another headline + .subhead + | by + a(href='#', rel='author') Matthew Honnibal + | on + time(datetime='2013-12-18') December 18, 2013 + p + | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. + a.readmore(href='#') ► + article.post + header + h2 + a(href='#') Another headline + .subhead + | by + a(href='#', rel='author') Matthew Honnibal + | on + time(datetime='2013-12-18') December 18, 2013 + p + | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. + a.readmore(href='#') ► + .readmore + a.button(href='#') Read more posts + section.intro + h2 + a.permalink(href='#tutorials', name='tutorials') Tutorials + p + | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est. + section.tutorials + details + summary + h4 Tutorial #1: How to do something cool + p + | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. + a.readmore(href='#') ► + details + summary + h4 Tutorial #2 + details + summary + h4 Tutorial #3 + + footer(role="contentinfo") + span.slogan.copyright © 2015 Syllogism Co. 
+ + script(src='js/prism.js') diff --git a/docs/redesign/blog_intro.jade b/docs/redesign/blog_intro.jade index 8b56d7daf..15112f587 100644 --- a/docs/redesign/blog_intro.jade +++ b/docs/redesign/blog_intro.jade @@ -1,3 +1,5 @@ +extends ./template_post.jade + - var urls = { 'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', @@ -9,7 +11,6 @@ 'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' } - - var my_research_software = 'my research software' - var how_to_write_a_POS_tagger = 'how to write a part-of-speech tagger' @@ -18,76 +19,63 @@ - var buy_a_commercial_license = 'buy a commercial license' -doctype html -html(lang='en') - head - meta(charset='utf-8') - title spaCy Blog - meta(name='description', content='') - meta(name='author', content='Matthew Honnibal') - link(rel='stylesheet', href='css/style.css') - //if lt IE 9 - script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') - body#blog - header(role='banner') - h1.logo spaCy Blog - .slogan Blog - main#content(role='main') - article.post - p. - spaCy is a new library for text processing in Python - and Cython. I wrote it because I think small companies are terrible at - natural language processing (NLP). Or rather: small companies are using - terrible NLP technology. - p. - To do great NLP, you have to know a little about linguistics, a lot - about machine learning, and almost everything about the latest research. - The people who fit this description seldom join small companies. - Most are broke – they've just finished grad school. - If they don't want to stay in academia, they join Google, IBM, etc. +block body_block + article.post + p. + spaCy is a new library for text processing in Python + and Cython. I wrote it because I think small companies are terrible at + natural language processing (NLP). Or rather: small companies are using + terrible NLP technology. - p. - The net result is that outside of the tech giants, commercial NLP has - changed little in the last ten years. In academia, it's changed entirely. - Amazing improvements in quality. Orders of magnitude faster. But the - academic code is always GPL, undocumented, unuseable, or all three. - You could implement the ideas yourself, but the papers are hard to read, - and training data is exorbitantly expensive. So what are you left with? - A common answer is NLTK, which was written primarily as an educational resource. - Nothing past the tokenizer is suitable for production use. + p. + To do great NLP, you have to know a little about linguistics, a lot + about machine learning, and almost everything about the latest research. + The people who fit this description seldom join small companies. + Most are broke – they've just finished grad school. + If they don't want to stay in academia, they join Google, IBM, etc. - p. - I used to think that the NLP community just needed to do more to communicate - its findings to software engineers. So I wrote two blog posts, explaining - !{how_to_write_a_POS_tagger} and !{parser_lnk}. Both were well - received, and there's been a bit of interest in !{my_research_software} - – even though it's entirely undocumented, and mostly unuseable to - anyone but me. - p. - So six months ago I quit my post-doc, and I've been working day and night - on spaCy since. I'm now pleased to announce an alpha release. - - p. - If you're a small company doing NLP, I think spaCy will seem like a minor - miracle. 
It's by far the fastest NLP software ever released. The - full processing pipeline completes in 20ms per document, including accurate - tagging and parsing. All strings are mapped to integer IDs, tokens are - linked to embedded word representations, and a range of useful features - are pre-calculated and cached. + p. + The net result is that outside of the tech giants, commercial NLP has + changed little in the last ten years. In academia, it's changed entirely. + Amazing improvements in quality. Orders of magnitude faster. But the + academic code is always GPL, undocumented, unuseable, or all three. + You could implement the ideas yourself, but the papers are hard to read, + and training data is exorbitantly expensive. So what are you left with? + A common answer is NLTK, which was written primarily as an educational resource. + Nothing past the tokenizer is suitable for production use. - p. - If none of that made any sense to you, here's the gist of it. Computers - don't understand text. This is unfortunate, because that's what the - web almost entirely consists of. We want to recommend people text based - on other text they liked. We want to shorten text to display it on a - mobile screen. We want to aggregate it, link it, filter it, categorise - it, generate it and correct it. + p. + I used to think that the NLP community just needed to do more to communicate + its findings to software engineers. So I wrote two blog posts, explaining + !{how_to_write_a_POS_tagger} and !{parser_lnk}. Both were well + received, and there's been a bit of interest in !{my_research_software} + – even though it's entirely undocumented, and mostly unuseable to + anyone but me. + p. + So six months ago I quit my post-doc, and I've been working day and night + on spaCy since. I'm now pleased to announce an alpha release. + + p. + If you're a small company doing NLP, I think spaCy will seem like a minor + miracle. It's by far the fastest NLP software ever released. The + full processing pipeline completes in 20ms per document, including accurate + tagging and parsing. All strings are mapped to integer IDs, tokens are + linked to embedded word representations, and a range of useful features + are pre-calculated and cached. - p. - spaCy provides a library of utility functions that help programmers - build such products. It's commercial open source software: you can - either use it under the AGPL, or you can !{buy_a_commercial_license} - under generous terms. + p. + If none of that made any sense to you, here's the gist of it. Computers + don't understand text. This is unfortunate, because that's what the + web almost entirely consists of. We want to recommend people text based + on other text they liked. We want to shorten text to display it on a + mobile screen. We want to aggregate it, link it, filter it, categorise + it, generate it and correct it. + + p. + spaCy provides a library of utility functions that help programmers + build such products. It's commercial open source software: you can + either use it under the AGPL, or you can !{buy_a_commercial_license} + under generous terms. 
footer(role='contentinfo') diff --git a/docs/redesign/blog_parser.jade b/docs/redesign/blog_parser.jade index 5806c677f..4930d8d26 100644 --- a/docs/redesign/blog_parser.jade +++ b/docs/redesign/blog_parser.jade @@ -1,922 +1,938 @@ -- - var urls = { - 'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', - 'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", - 'implementation': 'https://gist.github.com/syllog1sm/10343947', - 'redshift': 'http://github.com/syllog1sm/redshift', - 'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', - 'acl_anthology': 'http://aclweb.org/anthology/', - 'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' - } +extends ./template_post.jade -doctype html -html(lang='en') - head - meta(charset='utf-8') - title spaCy Blog - meta(name='description', content='') - meta(name='author', content='Matthew Honnibal') - link(rel='stylesheet', href='css/style.css') - //if lt IE 9 - script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') - body#blog - header(role='banner') - h1.logo spaCy Blog - .slogan Blog - main#content(role='main') - article.post - header - h2 Parsing English with 500 lines of Python - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | A - a(href=urls.google_ngrams) syntactic parser - | describes a sentence’s grammatical structure, to help another - | application reason about it. Natural languages introduce many unexpected - | ambiguities, which our world-knowledge immediately filters out. A - | favourite example: +block body_block + - var urls = {} + //- urls.pos_post = 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/' + - urls.parser_post = "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html" + - urls.implementation = 'https://gist.github.com/syllog1sm/10343947' + - urls.redshift = 'http://github.com/syllog1sm/redshift' + - urls.tasker = 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm' + - urls.acl_anthology = 'http://aclweb.org/anthology/' + - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" - p.example They ate the pizza with anchovies + // A comment - p - img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity') - p - | A correct parse links “with” to “pizza”, while an incorrect parse - | links “with” to “eat”: + article.post + header + h2 Parsing English with 500 lines of Python + .subhead + | by + a(href='#', rel='author') Matthew Honnibal + | on + time(datetime='2013-12-18') December 18, 2013 + p + | A + a(href=urls.google_ngrams) syntactic parser + | describes a sentence’s grammatical structure, to help another + | application reason about it. Natural languages introduce many unexpected + | ambiguities, which our world-knowledge immediately filters out. A + | favourite example: - .displacy - iframe(src='displacy/anchovies_bad.html', height='275') + p.example They ate the pizza with anchovies - .displacy - iframe.displacy(src='displacy/anchovies_good.html', height='275') - a.view-displacy(href='#') View on displaCy - p.caption - | The Natural Language Processing (NLP) community has made big progress - | in syntactic parsing over the last few years. 
+ p + img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity') + p + | A correct parse links “with” to “pizza”, while an incorrect parse + | links “with” to “eat”: - p - | The Natural Language Processing (NLP) community has made big progress - | in syntactic parsing over the last few years. It’s now possible for - | a tiny Python implementation to perform better than the widely-used - | Stanford PCFG parser. + .displacy + iframe(src='displacy/anchovies_bad.html', height='275') - p - strong Update! - | The Stanford CoreNLP library now includes a greedy transition-based - | dependency parser, similar to the one described in this post, but with - | an improved learning strategy. It is much faster and more accurate - | than this simple Python implementation. + .displacy + iframe.displacy(src='displacy/anchovies_good.html', height='275') + a.view-displacy(href='#') View on displaCy + p.caption + | The Natural Language Processing (NLP) community has made big progress + | in syntactic parsing over the last few years. - table - thead - tr - th Parser - th Accuracy - th Speed (w/s) - th Language - th LOC - tbody - tr - td Stanford - td 89.6% - td 19 - td Java - td - | > 4,000 - sup - a(href='#note-1') [1] - tr - td - strong parser.py - td 89.8% - td 2,020 - td Python - strong ~500 - tr - td Redshift - td - strong 93.6% - td - strong 2,580 - td Cython - td ~4,000 - p - | The rest of the post sets up the problem, and then takes you through - a(href=urls.implementation) a concise implementation - | , prepared for this post. The first 200 lines of parser.py, the - | part-of-speech tagger and learner, are described - a(href=pos_tagger_url) here. You should probably at least skim that - | post before reading this one, unless you’re very familiar with NLP - | research. - p - | The Cython system, Redshift, was written for my current research. I - | plan to improve it for general use in June, after my contract ends - | at Macquarie University. The current version is - a(href=urls.redshift) hosted on GitHub - | . - h3 Problem Description + p + | The Natural Language Processing (NLP) community has made big progress + | in syntactic parsing over the last few years. It’s now possible for + | a tiny Python implementation to perform better than the widely-used + | Stanford PCFG parser. - p It’d be nice to type an instruction like this into your phone: + p + strong Update! + | The Stanford CoreNLP library now includes a greedy transition-based + | dependency parser, similar to the one described in this post, but with + | an improved learning strategy. It is much faster and more accurate + | than this simple Python implementation. - p.example - Set volume to zero when I’m in a meeting, unless John’s school calls. - p - | And have it set the appropriate policy. On Android you can do this - | sort of thing with - a(href=urls.tasker) Tasker - | , but an NL interface would be much better. It’d be especially nice - | to receive a meaning representation you could edit, so you could see - | what it thinks you said, and correct it. - p - | There are lots of problems to solve to make that work, but some sort - | of syntactic representation is definitely necessary. 
We need to know that: + table + thead + tr + th Parser + th Accuracy + th Speed (w/s) + th Language + th LOC + tbody + tr + td Stanford + td 89.6% + td 19 + td Java + td + | > 4,000 + sup + a(href='#note-1') [1] + tr + td + strong parser.py + td 89.8% + td 2,020 + td Python + strong ~500 + tr + td Redshift + td + strong 93.6% + td + strong 2,580 + td Cython + td ~4,000 + p + | The rest of the post sets up the problem, and then takes you through + a(href=urls.implementation) a concise implementation + | , prepared for this post. The first 200 lines of parser.py, the + | part-of-speech tagger and learner, are described + a(href=pos_tagger_url) here. You should probably at least skim that + | post before reading this one, unless you’re very familiar with NLP + | research. + p + | The Cython system, Redshift, was written for my current research. I + | plan to improve it for general use in June, after my contract ends + | at Macquarie University. The current version is + a(href=urls.redshift) hosted on GitHub + | . + h3 Problem Description - p.example - Unless John’s school calls, when I’m in a meeting, set volume to zero + p It’d be nice to type an instruction like this into your phone: - p is another way of phrasing the first instruction, while: + p.example + Set volume to zero when I’m in a meeting, unless John’s school calls. + p + | And have it set the appropriate policy. On Android you can do this + | sort of thing with + a(href=urls.tasker) Tasker + | , but an NL interface would be much better. It’d be especially nice + | to receive a meaning representation you could edit, so you could see + | what it thinks you said, and correct it. + p + | There are lots of problems to solve to make that work, but some sort + | of syntactic representation is definitely necessary. We need to know that: - p.example - Unless John’s school, call when I’m in a meeting + p.example + Unless John’s school calls, when I’m in a meeting, set volume to zero - p means something completely different. + p is another way of phrasing the first instruction, while: - p - | A dependency parser returns a graph of word-word relationships, - | intended to make such reasoning easier. Our graphs will be trees – - | edges will be directed, and every node (word) will have exactly one - | incoming arc (one dependency, with its head), except one. + p.example + Unless John’s school, call when I’m in a meeting - h4 Example usage + p means something completely different. - pre.language-python. + p + | A dependency parser returns a graph of word-word relationships, + | intended to make such reasoning easier. Our graphs will be trees – + | edges will be directed, and every node (word) will have exactly one + | incoming arc (one dependency, with its head), except one. - p. - The idea is that it should be slightly easier to reason from the parse, - than it was from the string. The parse-to-meaning mapping is hopefully - simpler than the string-to-meaning mapping. + h4 Example usage - p. - The most confusing thing about this problem area is that “correctness” - is defined by convention — by annotation guidelines. If you haven’t - read the guidelines and you’re not a linguist, you can’t tell whether - the parse is “wrong” or “right”, which makes the whole task feel weird - and artificial. - - p. - For instance, there’s a mistake in the parse above: “John’s school - calls” is structured wrongly, according to the Stanford annotation - guidelines. 
The structure of that part of the sentence is how the - annotators were instructed to parse an example like “John’s school - clothes”. - - p - | It’s worth dwelling on this point a bit. We could, in theory, have - | written our guidelines so that the “correct” parses were reversed. - | There’s good reason to believe the parsing task will be harder if we - | reversed our convention, as it’d be less consistent with the rest of - | the grammar. - sup: a(href='#note-2') [2] - | But we could test that empirically, and we’d be pleased to gain an - | advantage by reversing the policy. + pre.language-python + code + | parser = parser.Parser() + | tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split() + | >>> tags, heads = parser.parse(tokens) + | >>> heads + | [-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11] + | >>> for i, h in enumerate(heads): + | ... head = tokens[heads[h]] if h >= 1 else 'None' + | ... print(tokens[i] + ' <-- ' + head]) + | Set <-- None + | the <-- volume + | volume <-- Set + | to <-- Set + | zero <-- to + | when <-- Set + | I <-- 'm + | 'm <-- when + | in <-- 'm + | a <-- meeting + | meeting <-- in + | unless <-- Set + | John <-- 's + | 's <-- calls + | school <-- calls + | calls <-- unless - p - | We definitely do want that distinction in the guidelines — we don’t - | want both to receive the same structure, or our output will be less - | useful. The annotation guidelines strike a balance between what - | distinctions downstream applications will find useful, and what - | parsers will be able to predict easily. + p. + The idea is that it should be slightly easier to reason from the parse, + than it was from the string. The parse-to-meaning mapping is hopefully + simpler than the string-to-meaning mapping. - h4 Projective trees - - p - | There’s a particularly useful simplification that we can make, when - | deciding what we want the graph to look like: we can restrict the - | graph structures we’ll be dealing with. This doesn’t just give us a - | likely advantage in learnability; it can have deep algorithmic - | implications. We follow most work on English in constraining the - | dependency graphs to be - em projective trees - | : - - ol - li Tree. Every word has exactly one head, except for the dummy ROOT symbol. - li - | Projective. For every pair of dependencies (a1, a2) and (b1, b2), - | if a1 < b2, then a2 >= b2. In other words, dependencies cannot “cross”. - | You can’t have a pair of dependencies that goes a1 b1 a2 b2, or - | b1 a1 b2 a2. - - p - | There’s a rich literature on parsing non-projective trees, and a - | smaller literature on parsing DAGs. But the parsing algorithm I’ll - | be explaining deals with projective trees. - - h3 Greedy transition-based parsing - - p - | Our parser takes as input a list of string tokens, and outputs a - | list of head indices, representing edges in the graph. If the - - em i - - | th member of heads is - - em j - - | , the dependency parse contains an edge (j, i). 
A transition-based - | parser is a finite-state transducer; it maps an array of N words - | onto an output array of N head indices: - - table.center - tbody - tr - td - em start - td MSNBC - td reported - td that - td Facebook - td bought - td WhatsApp - td for - td $16bn - td - em root - tr - td 0 - td 2 - td 9 - td 2 - td 4 - td 2 - td 4 - td 4 - td 7 - td 0 - p - | The heads array denotes that the head of - em MSNBC - | is - em reported - | : - em MSNBC - | is word 1, and - em reported - | is word 2, and - code.language-python heads[1] == 2 - | . You can already see why parsing a tree is handy — this data structure - | wouldn’t work if we had to output a DAG, where words may have multiple - | heads. - - p - | Although - code.language-python heads - | can be represented as an array, we’d actually like to maintain some - | alternate ways to access the parse, to make it easy and efficient to - | extract features. Our - - code.language-python Parse - | class looks like this: - - pre.language-python - code - | class Parse(object): - | def __init__(self, n): - | self.n = n - | self.heads = [None] * (n-1) - | self.lefts = [] - | self.rights = [] - | for i in range(n+1): - | self.lefts.append(DefaultList(0)) - | self.rights.append(DefaultList(0)) - | - | def add_arc(self, head, child): - | self.heads[child] = head - | if child < head: - | self.lefts[head].append(child) - | else: - | self.rights[head].append(child) - - p - | As well as the parse, we also have to keep track of where we’re up - | to in the sentence. We’ll do this with an index into the - code.language-python words - | array, and a stack, to which we’ll push words, before popping them - | once their head is set. So our state data structure is fundamentally: - - ul - li An index, i, into the list of tokens; - li The dependencies added so far, in Parse - li - | A stack, containing words that occurred before i, for which we’re - | yet to assign a head. - - p Each step of the parsing process applies one of three actions to the state: - - pre.language-python - code - | SHIFT = 0; RIGHT = 1; LEFT = 2 - | MOVES = [SHIFT, RIGHT, LEFT] - | - | def transition(move, i, stack, parse): - | global SHIFT, RIGHT, LEFT - | if move == SHIFT: - | stack.append(i) - | return i + 1 - | elif move == RIGHT: - | parse.add_arc(stack[-2], stack.pop()) - | return i - | elif move == LEFT: - | parse.add_arc(i, stack.pop()) - | return i - | raise GrammarError("Unknown move: %d" % move) - - - - p - | The - code.language-python LEFT - | and - code.language-python RIGHT - | actions add dependencies and pop the stack, while - code.language-python SHIFT - | pushes the stack and advances i into the buffer. - p. - So, the parser starts with an empty stack, and a buffer index at 0, with - no dependencies recorded. It chooses one of the (valid) actions, and - applies it to the state. It continues choosing actions and applying - them until the stack is empty and the buffer index is at the end of - the input. (It’s hard to understand this sort of algorithm without - stepping through it. Try coming up with a sentence, drawing a projective - parse tree over it, and then try to reach the parse tree by choosing - the right sequence of transitions.) - - p Here’s what the parsing loop looks like in code: - - pre.language-python - code - | class Parser(object): - | ... 
- | def parse(self, words): - | tags = self.tagger(words) - | n = len(words) - | idx = 1 - | stack = [0] - | deps = Parse(n) - | while stack or idx < n: - | features = extract_features(words, tags, idx, n, stack, deps) - | scores = self.model.score(features) - | valid_moves = get_valid_moves(i, n, len(stack)) - | next_move = max(valid_moves, key=lambda move: scores[move]) - | idx = transition(next_move, idx, stack, parse) - | return tags, parse - | - | def get_valid_moves(i, n, stack_depth): - | moves = [] - | if i < n: - | moves.append(SHIFT) - | if stack_depth <= 2: - | moves.append(RIGHT) - | if stack_depth <= 1: - | moves.append(LEFT) - | return moves - - p. - We start by tagging the sentence, and initializing the state. We then - map the state to a set of features, which we score using a linear model. - We then find the best-scoring valid move, and apply it to the state. - - p - | The model scoring works the same as it did in - a(href=urls.post) the POS tagger. - | If you’re confused about the idea of extracting features and scoring - | them with a linear model, you should review that post. Here’s a reminder - | of how the model scoring works: - - pre.language-python - code - | class Perceptron(object) - | ... - | def score(self, features): - | all_weights = self.weights - | scores = dict((clas, 0) for clas in self.classes) - | for feat, value in features.items(): - | if value == 0: - | continue - | if feat not in all_weights: - | continue - | weights = all_weights[feat] - | for clas, weight in weights.items(): - | scores[clas] += value * weight - | return scores - - p. - It’s just summing the class-weights for each feature. This is often - expressed as a dot-product, but when you’re dealing with multiple - classes, that gets awkward, I find. - - p. - The beam parser (RedShift) tracks multiple candidates, and only decides - on the best one at the very end. We’re going to trade away accuracy - in favour of efficiency and simplicity. We’ll only follow a single - analysis. Our search strategy will be entirely greedy, as it was with - the POS tagger. We’ll lock-in our choices at every step. - - p. - If you read the POS tagger post carefully, you might see the underlying - similarity. What we’ve done is mapped the parsing problem onto a - sequence-labelling problem, which we address using a “flat”, or unstructured, - learning algorithm (by doing greedy search). - - h3 Features - p. - Feature extraction code is always pretty ugly. The features for the parser - refer to a few tokens from the context: - - ul - li The first three words of the buffer (n0, n1, n2) - li The top three words of the stack (s0, s1, s2) - li The two leftmost children of s0 (s0b1, s0b2); - li The two rightmost children of s0 (s0f1, s0f2); - li The two leftmost children of n0 (n0b1, n0b2) - - p. - For these 12 tokens, we refer to the word-form, the part-of-speech tag, - and the number of left and right children attached to the token. - - p. - Because we’re using a linear model, we have our features refer to pairs - and triples of these atomic properties. 
- - pre.language-python - code - | def extract_features(words, tags, n0, n, stack, parse): - | def get_stack_context(depth, stack, data): - | if depth >= 3: - | return data[stack[-1]], data[stack[-2]], data[stack[-3]] - | elif depth >= 2: - | return data[stack[-1]], data[stack[-2]], '' - | elif depth == 1: - | return data[stack[-1]], '', '' - | else: - | return '', '', '' - | - | def get_buffer_context(i, n, data): - | if i + 1 >= n: - | return data[i], '', '' - | elif i + 2 >= n: - | return data[i], data[i + 1], '' - | else: - | return data[i], data[i + 1], data[i + 2] - | - | def get_parse_context(word, deps, data): - | if word == -1: - | return 0, '', '' - | deps = deps[word] - | valency = len(deps) - | if not valency: - | return 0, '', '' - | elif valency == 1: - | return 1, data[deps[-1]], '' - | else: - | return valency, data[deps[-1]], data[deps[-2]] - | - | features = {} - | # Set up the context pieces --- the word, W, and tag, T, of: - | # S0-2: Top three words on the stack - | # N0-2: First three words of the buffer - | # n0b1, n0b2: Two leftmost children of the first word of the buffer - | # s0b1, s0b2: Two leftmost children of the top word of the stack - | # s0f1, s0f2: Two rightmost children of the top word of the stack - | - | depth = len(stack) - | s0 = stack[-1] if depth else -1 - | - | Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) - | Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) - | - | Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) - | Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) - | - | Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) - | Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) - | - | Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) - | _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) - | - | Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) - | _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) - | - | Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) - | _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) - | - | # Cap numeric features at 5? 
- | # String-distance - | Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0 - | - | features['bias'] = 1 - | # Add word and tag unigrams - | for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): - | if w: - | features['w=%s' % w] = 1 - | for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): - | if t: - | features['t=%s' % t] = 1 - | - | # Add word/tag pairs - | for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): - | if w or t: - | features['%d w=%s, t=%s' % (i, w, t)] = 1 - | - | # Add some bigrams - | features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1 - | features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 - | features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 - | features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 - | features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 - | features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 - | features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 - | features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 - | - | # Add some tag trigrams - | trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0), - | (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1), - | (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), - | (Ts0, Ts1, Ts1)) - | for i, (t1, t2, t3) in enumerate(trigrams): - | if t1 or t2 or t3: - | features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 - | - | # Add some valency and distance features - | vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) - | vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) - | d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), - | ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) - | for i, (w_t, v_d) in enumerate(vw + vt + d): - | if w_t or v_d: - | features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 - | return features + p. + The most confusing thing about this problem area is that “correctness” + is defined by convention — by annotation guidelines. If you haven’t + read the guidelines and you’re not a linguist, you can’t tell whether + the parse is “wrong” or “right”, which makes the whole task feel weird + and artificial. - - h3 Training - - p. - Weights are learned using the same algorithm, averaged perceptron, that - we used for part-of-speech tagging. Its key strength is that it’s an - online learning algorithm: examples stream in one-by-one, we make our - prediction, check the actual answer, and adjust our beliefs (weights) - if we were wrong. - - p The training loop looks like this: - - pre.language-python - code - | class Parser(object): - | ... - | def train_one(self, itn, words, gold_tags, gold_heads): - | n = len(words) - | i = 2; stack = [1]; parse = Parse(n) - | tags = self.tagger.tag(words) - | while stack or (i + 1) < n: - | features = extract_features(words, tags, i, n, stack, parse) - | scores = self.model.score(features) - | valid_moves = get_valid_moves(i, n, len(stack)) - | guess = max(valid_moves, key=lambda move: scores[move]) - | gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) - | best = max(gold_moves, key=lambda move: scores[move]) - | self.model.update(best, guess, features) - | i = transition(guess, i, stack, parse) - | # Return number correct - | return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) + p. + For instance, there’s a mistake in the parse above: “John’s school + calls” is structured wrongly, according to the Stanford annotation + guidelines. 
The structure of that part of the sentence is how the + annotators were instructed to parse an example like “John’s school + clothes”. + p + | It’s worth dwelling on this point a bit. We could, in theory, have + | written our guidelines so that the “correct” parses were reversed. + | There’s good reason to believe the parsing task will be harder if we + | reversed our convention, as it’d be less consistent with the rest of + | the grammar. + sup: a(href='#note-2') [2] + | But we could test that empirically, and we’d be pleased to gain an + | advantage by reversing the policy. - - p. - The most interesting part of the training process is in - code.language-python get_gold_moves. - The performance of our parser is made possible by an advance by Goldberg - and Nivre (2012), who showed that we’d been doing this wrong for years. + p + | We definitely do want that distinction in the guidelines — we don’t + | want both to receive the same structure, or our output will be less + | useful. The annotation guidelines strike a balance between what + | distinctions downstream applications will find useful, and what + | parsers will be able to predict easily. + + h4 Projective trees + + p + | There’s a particularly useful simplification that we can make, when + | deciding what we want the graph to look like: we can restrict the + | graph structures we’ll be dealing with. This doesn’t just give us a + | likely advantage in learnability; it can have deep algorithmic + | implications. We follow most work on English in constraining the + | dependency graphs to be + em projective trees + | : + + ol + li Tree. Every word has exactly one head, except for the dummy ROOT symbol. + li + | Projective. For every pair of dependencies (a1, a2) and (b1, b2), + | if a1 < b2, then a2 >= b2. In other words, dependencies cannot “cross”. + | You can’t have a pair of dependencies that goes a1 b1 a2 b2, or + | b1 a1 b2 a2. + + p + | There’s a rich literature on parsing non-projective trees, and a + | smaller literature on parsing DAGs. But the parsing algorithm I’ll + | be explaining deals with projective trees. + + h3 Greedy transition-based parsing + + p + | Our parser takes as input a list of string tokens, and outputs a + | list of head indices, representing edges in the graph. If the + + em i + + | th member of heads is + + em j + + | , the dependency parse contains an edge (j, i). A transition-based + | parser is a finite-state transducer; it maps an array of N words + | onto an output array of N head indices: + + table.center + tbody + tr + td + em start + td MSNBC + td reported + td that + td Facebook + td bought + td WhatsApp + td for + td $16bn + td + em root + tr + td 0 + td 2 + td 9 + td 2 + td 4 + td 2 + td 4 + td 4 + td 7 + td 0 + p + | The heads array denotes that the head of + em MSNBC + | is + em reported + | : + em MSNBC + | is word 1, and + em reported + | is word 2, and + code.language-python heads[1] == 2 + | . You can already see why parsing a tree is handy — this data structure + | wouldn’t work if we had to output a DAG, where words may have multiple + | heads. + + p + | Although + code.language-python heads + | can be represented as an array, we’d actually like to maintain some + | alternate ways to access the parse, to make it easy and efficient to + | extract features. 
Our + + code.language-python Parse + | class looks like this: + + pre.language-python + code + | class Parse(object): + | def __init__(self, n): + | self.n = n + | self.heads = [None] * (n-1) + | self.lefts = [] + | self.rights = [] + | for i in range(n+1): + | self.lefts.append(DefaultList(0)) + | self.rights.append(DefaultList(0)) + | + | def add_arc(self, head, child): + | self.heads[child] = head + | if child < head: + | self.lefts[head].append(child) + | else: + | self.rights[head].append(child) + + p + | As well as the parse, we also have to keep track of where we’re up + | to in the sentence. We’ll do this with an index into the + code.language-python words + | array, and a stack, to which we’ll push words, before popping them + | once their head is set. So our state data structure is fundamentally: + + ul + li An index, i, into the list of tokens; + li The dependencies added so far, in Parse + li + | A stack, containing words that occurred before i, for which we’re + | yet to assign a head. + + p Each step of the parsing process applies one of three actions to the state: + + pre.language-python + code + | SHIFT = 0; RIGHT = 1; LEFT = 2 + | MOVES = [SHIFT, RIGHT, LEFT] + | + | def transition(move, i, stack, parse): + | global SHIFT, RIGHT, LEFT + | if move == SHIFT: + | stack.append(i) + | return i + 1 + | elif move == RIGHT: + | parse.add_arc(stack[-2], stack.pop()) + | return i + | elif move == LEFT: + | parse.add_arc(i, stack.pop()) + | return i + | raise GrammarError("Unknown move: %d" % move) + + + + p + | The + code.language-python LEFT + | and + code.language-python RIGHT + | actions add dependencies and pop the stack, while + code.language-python SHIFT + | pushes the stack and advances i into the buffer. + p. + So, the parser starts with an empty stack, and a buffer index at 0, with + no dependencies recorded. It chooses one of the (valid) actions, and + applies it to the state. It continues choosing actions and applying + them until the stack is empty and the buffer index is at the end of + the input. (It’s hard to understand this sort of algorithm without + stepping through it. Try coming up with a sentence, drawing a projective + parse tree over it, and then try to reach the parse tree by choosing + the right sequence of transitions.) + + p Here’s what the parsing loop looks like in code: + + pre.language-python + code + | class Parser(object): + | ... + | def parse(self, words): + | tags = self.tagger(words) + | n = len(words) + | idx = 1 + | stack = [0] + | deps = Parse(n) + | while stack or idx < n: + | features = extract_features(words, tags, idx, n, stack, deps) + | scores = self.model.score(features) + | valid_moves = get_valid_moves(i, n, len(stack)) + | next_move = max(valid_moves, key=lambda move: scores[move]) + | idx = transition(next_move, idx, stack, parse) + | return tags, parse + | + | def get_valid_moves(i, n, stack_depth): + | moves = [] + | if i < n: + | moves.append(SHIFT) + | if stack_depth <= 2: + | moves.append(RIGHT) + | if stack_depth <= 1: + | moves.append(LEFT) + | return moves + + p. + We start by tagging the sentence, and initializing the state. We then + map the state to a set of features, which we score using a linear model. + We then find the best-scoring valid move, and apply it to the state. + + p + | The model scoring works the same as it did in + a(href=urls.post) the POS tagger. + | If you’re confused about the idea of extracting features and scoring + | them with a linear model, you should review that post. 
Here’s a reminder + | of how the model scoring works: + + pre.language-python + code + | class Perceptron(object) + | ... + | def score(self, features): + | all_weights = self.weights + | scores = dict((clas, 0) for clas in self.classes) + | for feat, value in features.items(): + | if value == 0: + | continue + | if feat not in all_weights: + | continue + | weights = all_weights[feat] + | for clas, weight in weights.items(): + | scores[clas] += value * weight + | return scores + + p. + It’s just summing the class-weights for each feature. This is often + expressed as a dot-product, but when you’re dealing with multiple + classes, that gets awkward, I find. + + p. + The beam parser (RedShift) tracks multiple candidates, and only decides + on the best one at the very end. We’re going to trade away accuracy + in favour of efficiency and simplicity. We’ll only follow a single + analysis. Our search strategy will be entirely greedy, as it was with + the POS tagger. We’ll lock-in our choices at every step. + + p. + If you read the POS tagger post carefully, you might see the underlying + similarity. What we’ve done is mapped the parsing problem onto a + sequence-labelling problem, which we address using a “flat”, or unstructured, + learning algorithm (by doing greedy search). + + h3 Features + p. + Feature extraction code is always pretty ugly. The features for the parser + refer to a few tokens from the context: + + ul + li The first three words of the buffer (n0, n1, n2) + li The top three words of the stack (s0, s1, s2) + li The two leftmost children of s0 (s0b1, s0b2); + li The two rightmost children of s0 (s0f1, s0f2); + li The two leftmost children of n0 (n0b1, n0b2) + + p. + For these 12 tokens, we refer to the word-form, the part-of-speech tag, + and the number of left and right children attached to the token. + + p. + Because we’re using a linear model, we have our features refer to pairs + and triples of these atomic properties. 
+ + pre.language-python + code + | def extract_features(words, tags, n0, n, stack, parse): + | def get_stack_context(depth, stack, data): + | if depth >= 3: + | return data[stack[-1]], data[stack[-2]], data[stack[-3]] + | elif depth >= 2: + | return data[stack[-1]], data[stack[-2]], '' + | elif depth == 1: + | return data[stack[-1]], '', '' + | else: + | return '', '', '' + | + | def get_buffer_context(i, n, data): + | if i + 1 >= n: + | return data[i], '', '' + | elif i + 2 >= n: + | return data[i], data[i + 1], '' + | else: + | return data[i], data[i + 1], data[i + 2] + | + | def get_parse_context(word, deps, data): + | if word == -1: + | return 0, '', '' + | deps = deps[word] + | valency = len(deps) + | if not valency: + | return 0, '', '' + | elif valency == 1: + | return 1, data[deps[-1]], '' + | else: + | return valency, data[deps[-1]], data[deps[-2]] + | + | features = {} + | # Set up the context pieces --- the word, W, and tag, T, of: + | # S0-2: Top three words on the stack + | # N0-2: First three words of the buffer + | # n0b1, n0b2: Two leftmost children of the first word of the buffer + | # s0b1, s0b2: Two leftmost children of the top word of the stack + | # s0f1, s0f2: Two rightmost children of the top word of the stack + | + | depth = len(stack) + | s0 = stack[-1] if depth else -1 + | + | Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) + | Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) + | + | Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) + | Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) + | + | Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) + | Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) + | + | Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) + | _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) + | + | Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) + | _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) + | + | Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) + | _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) + | + | # Cap numeric features at 5? 
+ | # String-distance + | Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0 + | + | features['bias'] = 1 + | # Add word and tag unigrams + | for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): + | if w: + | features['w=%s' % w] = 1 + | for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): + | if t: + | features['t=%s' % t] = 1 + | + | # Add word/tag pairs + | for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): + | if w or t: + | features['%d w=%s, t=%s' % (i, w, t)] = 1 + | + | # Add some bigrams + | features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1 + | features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 + | features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 + | features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 + | features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 + | features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 + | features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 + | features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 + | + | # Add some tag trigrams + | trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0), + | (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1), + | (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), + | (Ts0, Ts1, Ts1)) + | for i, (t1, t2, t3) in enumerate(trigrams): + | if t1 or t2 or t3: + | features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 + | + | # Add some valency and distance features + | vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) + | vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) + | d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), + | ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) + | for i, (w_t, v_d) in enumerate(vw + vt + d): + | if w_t or v_d: + | features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 + | return features + + + h3 Training + + p. + Weights are learned using the same algorithm, averaged perceptron, that + we used for part-of-speech tagging. Its key strength is that it’s an + online learning algorithm: examples stream in one-by-one, we make our + prediction, check the actual answer, and adjust our beliefs (weights) + if we were wrong. - p - | In the POS-tagging post, I cautioned that during training you need to - | make sure you pass in the last two - em predicted - | tags as features for the current tag, not the last two - em gold - | tags. At test time you’ll only have the predicted tags, so if you - | base your features on the gold sequence during training, your training - | contexts won’t resemble your test-time contexts, so you’ll learn the - | wrong weights. + p The training loop looks like this: - p. - In parsing, the problem was that we didn’t know - em how - | to pass in the predicted sequence! Training worked by taking the - | gold-standard tree, and finding a transition sequence that led to it. - | i.e., you got back a sequence of moves, with the guarantee that if - | you followed those moves, you’d get the gold-standard dependencies. - - p - | The problem is, we didn’t know how to define the “correct” move to - | teach a parser to make if it was in any state that - em wasn’t - | along that gold-standard sequence. Once the parser had made a mistake, - | we didn’t know how to train from that example. + pre.language-python + code + | class Parser(object): + | ... 
+ | def train_one(self, itn, words, gold_tags, gold_heads): + | n = len(words) + | i = 2; stack = [1]; parse = Parse(n) + | tags = self.tagger.tag(words) + | while stack or (i + 1) < n: + | features = extract_features(words, tags, i, n, stack, parse) + | scores = self.model.score(features) + | valid_moves = get_valid_moves(i, n, len(stack)) + | guess = max(valid_moves, key=lambda move: scores[move]) + | gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) + | best = max(gold_moves, key=lambda move: scores[move]) + | self.model.update(best, guess, features) + | i = transition(guess, i, stack, parse) + | # Return number correct + | return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) - p - | That was a big problem, because it meant that once the parser started - | making mistakes, it would end up in states unlike any in its training - | data – leading to yet more mistakes. The problem was specific - | to greedy parsers: once you use a beam, there’s a natural way to do - | structured prediction. - p - | The solution seems obvious once you know it, like all the best breakthroughs. - | What we do is define a function that asks “How many gold-standard - | dependencies can be recovered from this state?”. If you can define - | that function, then you can apply each move in turn, and ask, “How - | many gold-standard dependencies can be recovered from - em this - | state?”. If the action you applied allows - em fewer - | gold-standard dependencies to be reached, then it is sub-optimal. - p That’s a lot to take in. + + p + | The most interesting part of the training process is in + code.language-python get_gold_moves. + | The performance of our parser is made possible by an advance by Goldberg + | and Nivre (2012), who showed that we’d been doing this wrong for years. + + p + | In the POS-tagging post, I cautioned that during training you need to + | make sure you pass in the last two + em predicted + | tags as features for the current tag, not the last two + em gold + | tags. At test time you’ll only have the predicted tags, so if you + | base your features on the gold sequence during training, your training + | contexts won’t resemble your test-time contexts, so you’ll learn the + | wrong weights. - p - | So we have this function - code Oracle(state) - | : - pre - code - | Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | - p - | We also have a set of actions, each of which returns a new state. - | We want to know: + p. + In parsing, the problem was that we didn’t know + em how + | to pass in the predicted sequence! Training worked by taking the + | gold-standard tree, and finding a transition sequence that led to it. + | i.e., you got back a sequence of moves, with the guarantee that if + | you followed those moves, you’d get the gold-standard dependencies. + + p + | The problem is, we didn’t know how to define the “correct” move to + | teach a parser to make if it was in any state that + em wasn’t + | along that gold-standard sequence. Once the parser had made a mistake, + | we didn’t know how to train from that example. - ul - li shift_cost = Oracle(state) – Oracle(shift(state)) - li right_cost = Oracle(state) – Oracle(right(state)) - li left_cost = Oracle(state) – Oracle(left(state)) - - p - | Now, at least one of those costs - em has - | to be zero. Oracle(state) is asking, “what’s the cost of the best - | path forward?”, and the first action of that best path has to be - | shift, right, or left. 
+ p + | That was a big problem, because it meant that once the parser started + | making mistakes, it would end up in states unlike any in its training + | data – leading to yet more mistakes. The problem was specific + | to greedy parsers: once you use a beam, there’s a natural way to do + | structured prediction. + p + | The solution seems obvious once you know it, like all the best breakthroughs. + | What we do is define a function that asks “How many gold-standard + | dependencies can be recovered from this state?”. If you can define + | that function, then you can apply each move in turn, and ask, “How + | many gold-standard dependencies can be recovered from + em this + | state?”. If the action you applied allows + em fewer + | gold-standard dependencies to be reached, then it is sub-optimal. - p - | It turns out that we can derive Oracle fairly simply for many transition - | systems. The derivation for the transition system we’re using, Arc - | Hybrid, is in Goldberg and Nivre (2013). + p That’s a lot to take in. - p - | We’re going to implement the oracle as a function that returns the - | zero-cost moves, rather than implementing a function Oracle(state). - | This prevents us from doing a bunch of costly copy operations. - | Hopefully the reasoning in the code isn’t too hard to follow, but - | you can also consult Goldberg and Nivre’s papers if you’re confused - | and want to get to the bottom of this. + p + | So we have this function + code Oracle(state) + | : + pre + code + | Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | + p + | We also have a set of actions, each of which returns a new state. + | We want to know: - pre.language-python - code - | def get_gold_moves(n0, n, stack, heads, gold): - | def deps_between(target, others, gold): - | for word in others: - | if gold[word] == target or gold[target] == word: - | return True - | return False - | - | valid = get_valid_moves(n0, n, len(stack)) - | if not stack or (SHIFT in valid and gold[n0] == stack[-1]): - | return [SHIFT] - | if gold[stack[-1]] == n0: - | return [LEFT] - | costly = set([m for m in MOVES if m not in valid]) - | # If the word behind s0 is its gold head, Left is incorrect - | if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: - | costly.add(LEFT) - | # If there are any dependencies between n0 and the stack, - | # pushing n0 will lose them. - | if SHIFT not in costly and deps_between(n0, stack, gold): - | costly.add(SHIFT) - | # If there are any dependencies between s0 and the buffer, popping - | # s0 will lose them. - | if deps_between(stack[-1], range(n0+1, n-1), gold): - | costly.add(LEFT) - | costly.add(RIGHT) - | return [m for m in MOVES if m not in costly] + ul + li shift_cost = Oracle(state) – Oracle(shift(state)) + li right_cost = Oracle(state) – Oracle(right(state)) + li left_cost = Oracle(state) – Oracle(left(state)) + + p + | Now, at least one of those costs + em has + | to be zero. Oracle(state) is asking, “what’s the cost of the best + | path forward?”, and the first action of that best path has to be + | shift, right, or left. + + p + | It turns out that we can derive Oracle fairly simply for many transition + | systems. The derivation for the transition system we’re using, Arc + | Hybrid, is in Goldberg and Nivre (2013). + + p + | We’re going to implement the oracle as a function that returns the + | zero-cost moves, rather than implementing a function Oracle(state). + | This prevents us from doing a bunch of costly copy operations. 
+ | Hopefully the reasoning in the code isn’t too hard to follow, but + | you can also consult Goldberg and Nivre’s papers if you’re confused + | and want to get to the bottom of this. + + pre.language-python + code + | def get_gold_moves(n0, n, stack, heads, gold): + | def deps_between(target, others, gold): + | for word in others: + | if gold[word] == target or gold[target] == word: + | return True + | return False + | + | valid = get_valid_moves(n0, n, len(stack)) + | if not stack or (SHIFT in valid and gold[n0] == stack[-1]): + | return [SHIFT] + | if gold[stack[-1]] == n0: + | return [LEFT] + | costly = set([m for m in MOVES if m not in valid]) + | # If the word behind s0 is its gold head, Left is incorrect + | if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: + | costly.add(LEFT) + | # If there are any dependencies between n0 and the stack, + | # pushing n0 will lose them. + | if SHIFT not in costly and deps_between(n0, stack, gold): + | costly.add(SHIFT) + | # If there are any dependencies between s0 and the buffer, popping + | # s0 will lose them. + | if deps_between(stack[-1], range(n0+1, n-1), gold): + | costly.add(LEFT) + | costly.add(RIGHT) + | return [m for m in MOVES if m not in costly] - p - | Doing this “dynamic oracle” training procedure makes a big difference - | to accuracy — typically 1-2%, with no difference to the way the run-time - | works. The old “static oracle” greedy training procedure is fully - | obsolete; there’s no reason to do it that way any more. + p + | Doing this “dynamic oracle” training procedure makes a big difference + | to accuracy — typically 1-2%, with no difference to the way the run-time + | works. The old “static oracle” greedy training procedure is fully + | obsolete; there’s no reason to do it that way any more. - h3 Conclusion + h3 Conclusion - p - | I have the sense that language technologies, particularly those relating - | to grammar, are particularly mysterious. I can imagine having no idea - | what the program might even do. + p + | I have the sense that language technologies, particularly those relating + | to grammar, are particularly mysterious. I can imagine having no idea + | what the program might even do. - p - | I think it therefore seems natural to people that the best solutions - | would be over-whelmingly complicated. A 200,000 line Java package - | feels appropriate. - p - | But, algorithmic code is usually short, when only a single algorithm - | is implemented. And when you only implement one algorithm, and you - | know exactly what you want to write before you write a line, you - | also don’t pay for any unnecessary abstractions, which can have a - | big performance impact. + p + | I think it therefore seems natural to people that the best solutions + | would be over-whelmingly complicated. A 200,000 line Java package + | feels appropriate. + p + | But, algorithmic code is usually short, when only a single algorithm + | is implemented. And when you only implement one algorithm, and you + | know exactly what you want to write before you write a line, you + | also don’t pay for any unnecessary abstractions, which can have a + | big performance impact. - h3 Notes - p - a(name='note-1') - | [1] I wasn’t really sure how to count the lines of code in the Stanford - | parser. Its jar file ships over 200k, but there are a lot of different - | models in it. It’s not important, but it's certainly over 4k. + h3 Notes + p + a(name='note-1') + | [1] I wasn’t really sure how to count the lines of code in the Stanford + | parser. 
Its jar file ships over 200k, but there are a lot of different + | models in it. It’s not important, but it's certainly over 4k. - p - a(name='note-2') - | [2] For instance, how would you parse, “John’s school of music calls”? - | You want to make sure the phrase “John’s school” has a consistent - | structure in both “John’s school calls” and “John’s school of music - | calls”. Reasoning about the different “slots” you can put a phrase - | into is a key way we reason about what syntactic analyses look like. - | You can think of each phrase as having a different shaped connector, - | which you need to plug into different slots — which each phrase also - | has a certain number of, each of a different shape. We’re trying to - | figure out what connectors are where, so we can figure out how the - | sentences are put together. + p + a(name='note-2') + | [2] For instance, how would you parse, “John’s school of music calls”? + | You want to make sure the phrase “John’s school” has a consistent + | structure in both “John’s school calls” and “John’s school of music + | calls”. Reasoning about the different “slots” you can put a phrase + | into is a key way we reason about what syntactic analyses look like. + | You can think of each phrase as having a different shaped connector, + | which you need to plug into different slots — which each phrase also + | has a certain number of, each of a different shape. We’re trying to + | figure out what connectors are where, so we can figure out how the + | sentences are put together. - h3 Idle speculation - p - | For a long time, incremental language processing algorithms were - | primarily of scientific interest. If you want to write a parser to - | test a theory about how the human sentence processor might work, well, - | that parser needs to build partial interpretations. There’s a wealth - | of evidence, including commonsense introspection, that establishes - | that we don’t buffer input and analyse it once the speaker has finished. + h3 Idle speculation + p + | For a long time, incremental language processing algorithms were + | primarily of scientific interest. If you want to write a parser to + | test a theory about how the human sentence processor might work, well, + | that parser needs to build partial interpretations. There’s a wealth + | of evidence, including commonsense introspection, that establishes + | that we don’t buffer input and analyse it once the speaker has finished. - p - | But now algorithms with that neat scientific feature are winning! - | As best as I can tell, the secret to that success is to be: + p + | But now algorithms with that neat scientific feature are winning! + | As best as I can tell, the secret to that success is to be: - ul - li Incremental. Earlier words constrain the search. - li - | Error-driven. Training involves a working hypothesis, which is - | updated as it makes mistakes. + ul + li Incremental. Earlier words constrain the search. + li + | Error-driven. Training involves a working hypothesis, which is + | updated as it makes mistakes. - p - | The links to human sentence processing seem tantalising. I look - | forward to seeing whether these engineering breakthroughs lead to - | any psycholinguistic advances. + p + | The links to human sentence processing seem tantalising. I look + | forward to seeing whether these engineering breakthroughs lead to + | any psycholinguistic advances. - h3 Bibliography + h3 Bibliography - p - | The NLP literature is almost entirely open access. 
All of the relavant - | papers can be found - a(href=urls.acl_anthology, rel='nofollow') here - | . - p - | The parser I’ve described is an implementation of the dynamic-oracle - | Arc-Hybrid system here: + p + | The NLP literature is almost entirely open access. All of the relavant + | papers can be found + a(href=urls.acl_anthology, rel='nofollow') here + | . + p + | The parser I’ve described is an implementation of the dynamic-oracle + | Arc-Hybrid system here: - span.bib-item - | Goldberg, Yoav; Nivre, Joakim. - em Training Deterministic Parsers with Non-Deterministic Oracles - | . TACL 2013 - p - | However, I wrote my own features for it. The arc-hybrid system was - | originally described here: + span.bib-item + | Goldberg, Yoav; Nivre, Joakim. + em Training Deterministic Parsers with Non-Deterministic Oracles + | . TACL 2013 + p + | However, I wrote my own features for it. The arc-hybrid system was + | originally described here: - span.bib-item - | Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic - | programming algorithms for transition-based dependency parsers. ACL 2011 + span.bib-item + | Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic + | programming algorithms for transition-based dependency parsers. ACL 2011 - p - | The dynamic oracle training method was first described here: - span.bib-item - | A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; - | Nivre, Joakim. COLING 2012 + p + | The dynamic oracle training method was first described here: + span.bib-item + | A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; + | Nivre, Joakim. COLING 2012 - p - | This work depended on a big break-through in accuracy for transition-based - | parsers, when beam-search was properly explored by Zhang and Clark. - | They have several papers, but the preferred citation is: + p + | This work depended on a big break-through in accuracy for transition-based + | parsers, when beam-search was properly explored by Zhang and Clark. + | They have several papers, but the preferred citation is: - span.bib-item - | Zhang, Yue; Clark, Steven. Syntactic Processing Using the Generalized - | Perceptron and Beam Search. Computational Linguistics 2011 (1) - p - | Another important paper was this little feature engineering paper, - | which further improved the accuracy: + span.bib-item + | Zhang, Yue; Clark, Steven. Syntactic Processing Using the Generalized + | Perceptron and Beam Search. Computational Linguistics 2011 (1) + p + | Another important paper was this little feature engineering paper, + | which further improved the accuracy: - span.bib-item - | Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with - | Rich Non-local Features. ACL 2011 + span.bib-item + | Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with + | Rich Non-local Features. ACL 2011 - p - | The generalised perceptron, which is the learning framework for these - | beam parsers, is from this paper: - span.bib-item - | Collins, Michael. Discriminative Training Methods for Hidden Markov - | Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002 + p + | The generalised perceptron, which is the learning framework for these + | beam parsers, is from this paper: + span.bib-item + | Collins, Michael. Discriminative Training Methods for Hidden Markov + | Models: Theory and Experiments with Perceptron Algorithms. 
EMNLP 2002 - h3 Experimental details - p - | The results at the start of the post refer to Section 22 of the Wall - | Street Journal corpus. The Stanford parser was run as follows: + h3 Experimental details + p + | The results at the start of the post refer to Section 22 of the Wall + | Street Journal corpus. The Stanford parser was run as follows: - pre.language-bash - code - | java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \ - | -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $* + pre.language-bash + code + | java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \ + | -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $* - p - | A small post-process was applied, to undo the fancy tokenisation - | Stanford adds for numbers, to make them match the PTB tokenisation: + p + | A small post-process was applied, to undo the fancy tokenisation + | Stanford adds for numbers, to make them match the PTB tokenisation: - pre.language-python - code - | """Stanford parser retokenises numbers. Split them.""" - | import sys - | import re - | - | qp_re = re.compile('\xc2\xa0') - | for line in sys.stdin: - | line = line.rstrip() - | if qp_re.search(line): - | line = line.replace('(CD', '(QP (CD', 1) + ')' - | line = line.replace('\xc2\xa0', ') (CD ') - | print line + pre.language-python + code + | """Stanford parser retokenises numbers. Split them.""" + | import sys + | import re + | + | qp_re = re.compile('\xc2\xa0') + | for line in sys.stdin: + | line = line.rstrip() + | if qp_re.search(line): + | line = line.replace('(CD', '(QP (CD', 1) + ')' + | line = line.replace('\xc2\xa0', ') (CD ') + | print line - p - | The resulting PTB-format files were then converted into dependencies - | using the Stanford converter: + p + | The resulting PTB-format files were then converted into dependencies + | using the Stanford converter: - pre.language-bash - code - | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp - | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ - | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll - p - | I can’t easily read that anymore, but it should just convert every - | .mrg file in a folder to a CoNLL-format Stanford basic dependencies - | file, using the settings common in the dependency literature. + pre.language-bash + code + | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp + | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ + | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll + p + | I can’t easily read that anymore, but it should just convert every + | .mrg file in a folder to a CoNLL-format Stanford basic dependencies + | file, using the settings common in the dependency literature. - p - | I then converted the gold-standard trees from WSJ 22, for the evaluation. - | Accuracy scores refer to unlabelled attachment score (i.e. the head index) - | of all non-punctuation tokens. + p + | I then converted the gold-standard trees from WSJ 22, for the evaluation. + | Accuracy scores refer to unlabelled attachment score (i.e. the head index) + | of all non-punctuation tokens. - p - | To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 - | into the same conversion script. + p + | To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 + | into the same conversion script. 
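+      p
+        | For reference, the scoring itself is only a few lines. This is a
+        | sketch of the calculation, not the evaluate.py script shown above;
+        | the
+          code.language-python is_punct
+        | check and the (words, heads) sentence format are assumptions.
+      pre.language-python
+        code
+          | def attachment_score(gold_sents, guess_sents):
+          |     # Unlabelled attachment score: the percentage of non-punctuation
+          |     # tokens whose predicted head index matches the gold-standard head.
+          |     correct = 0; total = 0
+          |     for (words, gold_heads), (_, guess_heads) in zip(gold_sents, guess_sents):
+          |         for i, word in enumerate(words):
+          |             if is_punct(word):
+          |                 continue
+          |             correct += guess_heads[i] == gold_heads[i]
+          |             total += 1
+          |     return 100.0 * correct / total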
- p - | In a nutshell: The Stanford model and parser.py are trained on the - | same set of sentences, and they each make their predictions on a - | held-out test set, for which we know the answers. Accuracy refers - | to how many of the words’ heads we got correct. + p + | In a nutshell: The Stanford model and parser.py are trained on the + | same set of sentences, and they each make their predictions on a + | held-out test set, for which we know the answers. Accuracy refers + | to how many of the words’ heads we got correct. - p - | Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a - | server, to give the Stanford parser more memory. The parser.py system - | runs fine on my MacBook Air. I used PyPy for the parser.py experiments; - | CPython was about half as fast on an early benchmark. + p + | Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a + | server, to give the Stanford parser more memory. The parser.py system + | runs fine on my MacBook Air. I used PyPy for the parser.py experiments; + | CPython was about half as fast on an early benchmark. - p - | One of the reasons parser.py is so fast is that it does unlabelled - | parsing. Based on previous experiments, a labelled parser would likely - | be about 40x slower, and about 1% more accurate. Adapting the program - | to labelled parsing would be a good exercise for the reader, if you - | have access to the data. + p + | One of the reasons parser.py is so fast is that it does unlabelled + | parsing. Based on previous experiments, a labelled parser would likely + | be about 40x slower, and about 1% more accurate. Adapting the program + | to labelled parsing would be a good exercise for the reader, if you + | have access to the data. - p - | The result from the Redshift parser was produced from commit - code.language-python b6b624c9900f3bf - | , which was run as follows: - pre.language-python. 
- footer.meta(role='contentinfo') - a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter - .discuss - a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News - a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit - footer(role='contentinfo') - script(src='js/prism.js') + p + | The result from the Redshift parser was produced from commit + code.language-python b6b624c9900f3bf + | , which was run as follows: + pre.language-bash + code + | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp + | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ + | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll< + footer.meta(role='contentinfo') + a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter + .discuss + a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News + | + a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit diff --git a/docs/redesign/change_log.jade b/docs/redesign/change_log.jade new file mode 100644 index 000000000..e69de29bb diff --git a/docs/redesign/comparisons.jade b/docs/redesign/comparisons.jade new file mode 100644 index 000000000..a80df8235 --- /dev/null +++ b/docs/redesign/comparisons.jade @@ -0,0 +1,78 @@ ++comparison("NLTK") +//+comparison("Pattern") ++comparison("CoreNLP") ++comparison("ClearNLP") +//+comparison("OpenNLP") +//+comparison("GATE") + ++comparison("Accuracy Summary") + ++comparison("Speed Summary") + table + thead + tr + th. + th(colspan=3) Absolute (ms per doc) + th(colspan=3) Relative (to spaCy) + + tbody + tr + td: strong System + td: strong Split + td: strong Tag + td: strong Parse + td: strong Split + td: strong Tag + td: strong Parse + + +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") + +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") + +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") + +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") + +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") + + p + | Set up: 100,000 plain-text documents were streamed + | from an SQLite3 database, and processed with an NLP library, to one + | of three levels of detail – tokenization, tagging, or parsing. + | The tasks are additive: to parse the text you have to tokenize and + | tag it. The pre-processing was not subtracted from the times – + | I report the time required for the pipeline to complete. I report + | mean times per document, in milliseconds. + + p + | Hardware: Intel i7-3770 (2012) + + + +comparison("Independent Evaluation") + p + | Independent evaluation by Yahoo! Labs and Emory + | University, to appear at ACL 2015. Higher is better. + + table + thead + +columns("System", "Language", "Accuracy", "Speed") + + tbody + +row("spaCy v0.86", "Cython", "91.9", "13,963") + +row("spaCy v0.84", "Cython", "90.6", "13,963") + +row("ClearNLP", "Java", "91.7", "10,271") + +row("CoreNLP", "Java", "89.6", "8,602") + +row("MATE", "Java", "92.5", "550") + +row("Turbo", "C++", "92.4", "349") + +row("Yara", "Java", "92.3", "340") + + p + | Accuracy is % unlabelled arcs correct, speed is tokens per second. + + p + | Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory) + | performed a detailed comparison of the best parsers available. 
+ | All numbers above are taken from the pre-print they kindly made + | available to me, except for spaCy v0.86. + + p + | I'm particularly grateful to the authors for discussion of their + | results, which led to the improvement in accuracy between v0.84 and + | v0.86. A tip from Jin-ho developer of ClearNLP) was particularly + | useful. diff --git a/docs/redesign/docs.jade b/docs/redesign/docs.jade index 29f0512e7..2b5c88760 100644 --- a/docs/redesign/docs.jade +++ b/docs/redesign/docs.jade @@ -1,15 +1,6 @@ -- var py_docs = 'unicode', - 'bool': py_docs + 'functions.html#bool">bool', - 'int': py_docs + 'functions.html#int">int', - 'generator': "", - 'Vocab': "", - 'Span': "", - 'Doc': "" - } +include ./mixins.jade mixin declare_class(name) @@ -107,599 +98,32 @@ mixin en_example | doc = nlp(unprocessed_unicode) -doctype html -html(lang="en") - head - meta(charset="utf-8") - title spaCy – Industrial-strength NLP - meta(name="description" content="") - meta(name="author" content="Matthew Honnibal") - link(rel="stylesheet" href="css/style.css") - - - body(id="docs") - header(role="banner") - h1.logo spaCy – Industrial-strength NLP - div.slogan API - +block intro_block + section(class="intro") nav(role="navigation") ul - li: a(href="#") Home - li.active: a(href="#") Docs - li: a(href="#") License - li: a(href="#") Blog + li: a(href="#api" class="button") API + li: a(href="#tutorials" class="button") Tutorials + li: a(href="#spec" class="button") Spec - main.docs#content - article - +declare_class("English") - p Load models into a callable object to process English text. +block body_block + - var py_docs = 'unicode', + 'bool': py_docs + 'functions.html#bool">bool', + 'int': py_docs + 'functions.html#int">int', + 'generator': "", + 'Vocab': "", + 'Span': "", + 'Doc': "" + } - +init - p - | Load the resources. Loading takes 20 seconds, and the instance - | consumes 2 to 3 gigabytes of memory. - - p - | Intended use is for one instance to be created per process. - | You can create more if you're doing something unusual. - p - | You may wish to make the instance a global variable or "singleton". - | We usually instantiate the object in the main() - | function and pass it around as an explicit argument. - +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") + article - +params - +param("data_dir") - | The data directory. May be #{None}, to disable any data loading - | (including the vocabulary). - - +param("Tokenizer") - | A class/function that creates the tokenizer. - - +param("Tagger") - | A class/function that creates the part-of-speech tagger. - - +param("Parser") - | A class/function that creates the dependency parser. - - +param("Entity") - | A class/function that creates the named entity recogniser. - - +param("load_vectors") - | A boolean value to control whether the word vectors are loaded. - - +callable - +method("__call__", "text, tag=True, parse=True, entity=True") - - +params - +param("text", types.unicode) - | The text to be processed. No pre-processing needs to be applied, - | and any length of text can be submitted. Usually you will submit - | a whole document. Text may be zero-length. An exception is raised - | if byte strings are supplied. - - +param("tag", bool_type) - | Whether to apply the part-of-speech tagger. Required for parsing - | and entity recognition. - - +param("parse", bool_type) - | Whether to apply the syntactic dependency parser. 
- - +param("entity", bool_type) - | Whether to apply the named entity recognizer. - - pre.language-python - code - | from spacy.en import English - | nlp = English() - | doc = nlp(u'Some text.) # Applies tagger, parser, entity - | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser - | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity - | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser - | doc = nlp(u'') # Zero-length tokens, not an error - | # doc = nlp(b'Some text') <-- Error: need unicode - | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. - - - +declare_class("Doc") - p I'm a doc - - +init - +method("__init__", "vocab") - +params - +param("vocab", vocab_type) - | A vocabulary object - - +sequence - +method("__getitem__", "i", types.int) - +returns(types.Token) - - +method("__getitem__", "start_end", types.slice) - +returns(types.Span) - - +method("__iter__") - | Iterate over tokens - - +method("__len__") - | Number of tokens in the document. - - details - summary: h4 Spans - - +attribute("sents", types.generator) - | Iterate over sentences in the document. - - +attribute("ents", types.generator) - | Iterate over named entities in the document. - - +attribute("noun_chunks", types.generator) - - details - summary: h4 Export/Import - - +method("to_array", "attr_ids") - - | Given a list of M attribute IDs, export the tokens to a numpy ndarray - | of shape N*M, where N is the length of the sentence. - - +params - +param("attr_ids", "list[int]") - | A list of attribute ID ints. - - +returns("feat_array") - | A feature matrix, with one row per word, and one column per attribute - | indicated in the input attr_ids. - - +method("count_by", "attr_id") - | Produce a dict of {attribute (int): count (ints)} frequencies, keyed - | by the values of the given attribute ID. - - pre.language-python - code - | >>> from spacy.en import English, attrs - | >>> nlp = English() - | >>> tokens = nlp(u'apple apple orange banana') - | >>> tokens.count_by(attrs.ORTH) - | {12800L: 1, 11880L: 2, 7561L: 1} - | >>> tokens.to_array([attrs.ORTH]) - | array([[11880], - | [11880], - | [7561], - | [12800]]) - - +method("from_array", "attrs, array") - | Load from array - - +method("from_bytes") - | Deserialize, loading from bytes - - +method("read_bytes") - | classmethod - - //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") - - // | Merge a multi-word expression into a single token. Currently - // | experimental; API is likely to change. - - - +declare_class("Token") - +init - +method("__init__", "vocab, doc, offset") - +params - +param("vocab", types.Vocab) - p A Vocab object - - +param("doc", types.Doc) - p The parent sequence - - +param("offset", types.int) - p The index of the token within the document - - details - summary: h4 String Views - - +attribute("orth / orth_") - | The form of the word with no string normalization or processing, as - | it appears in the string, without trailing whitespace. - - +attribute("lemma / lemma_") - | The "base" of the word, with no inflectional suffixes, e.g. the lemma of - | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that - | derivational suffixes are not stripped, e.g. the lemma of - | "instutitions" is "institution", not "institute". Lemmatization is - | performed using the WordNet data, but extended to also cover closed-class - | words such as pronouns. 
By default, the WN lemmatizer returns "hi" - | as the lemma of "his". We assign pronouns the lemma -PRON-. - - +attribute("lower / lower_") - | The form of the word, but forced to lower-case, i.e. - pre.language-python: code lower = word.orth\_.lower() - - //+attribute("norm / norm_") - // | The form of the word, after language-specific normalizations has been - // | applied. - - +attribute("shape / shape_") - | A transform of the word's string, to show orthographic features. - | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped - | to d. After these mappings, sequences of 4 or more of the same character - | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, - | :) --> :) - - +attribute("prefix / prefix_") - | A length-N substring from the start of the word. Length may vary by - | language; currently for English n=1, i.e. - pre.language-python: code prefix = word.orth\_[:1] - - +attribute("suffix / suffix_") - | A length-N substring from the end of the word. Length may vary by - | language; currently for English n=3, i.e. - pre.language-python: code suffix = word.orth\_[-3:] - - //+attribute("lex_id") - // | lex_id - - details - summary: h4 Alignment and Output - - +attribute("idx") - p Start index of the token in the string - - +method("__len__", "") - p Length of the token's orth string, in unicode code-points. - - +method("__unicode__", "") - p Same as token.orth_ - - +method("__str__", "") - p Varies between Python 2 and Python 3 - - +attribute("string") - p - | The form of the word as it appears in the string, including - | trailing whitespace. This is useful when you need to use - | linguistic features to add inline mark-up to the string. - - +method("nbor, i=1") - +params - +param("i") - p Offset relative to token - - details - summary: h4 Distributional Features - - +attribute("repvec") - p - | A "word embedding" representation: a dense real-valued vector that supports - | similarity queries between words. By default, spaCy currently loads - | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec - | model. - - +attribute("cluster") - p - | The Brown cluster ID of the word. These are often useful features for - | linear models. If you're using a non-linear model, particularly a - | neural net or random forest, consider using the real-valued word - | representation vector, in Token.repvec, instead. - - +attribute("prob") - p - | The unigram log-probability of the word, estimated from counts from a - | large corpus, smoothed using Simple Good Turing estimation. - - details - summary: h4 Syntactic Tags - - +attribute("pos / pos_") - p - | A part-of-speech tag, from the Google Universal Tag Set, e.g. - | code>NOUN, VERB, ADV. Constants for - | the 17 tag values are provided in spacy.parts_of_speech. - - +attribute("tag / tag_") - p - | A morphosyntactic tag, e.g. NN, VBZ, - | DT, etc. These tags are language/corpus specific, and - | typically describe part-of-speech and some amount of morphological - | information. For instance, in the Penn Treebank tag set, VBZ - | is assigned to a present-tense singular verb. - - +attribute("dep / dep_") - p - | The type of syntactic dependency relation between the word and its - | syntactic head. - - details - summary: h4 Navigating the Parse Tree - - +attribute("head") - p - | The Token that is the immediate syntactic head of the word. If the - | word is the root of the dependency tree, the same word is returned. 
- - +attribute("lefts") - p - | An iterator for the immediate leftward syntactic children of the - | word. - - +attribute("rights") - p - | An iterator for the immediate rightward syntactic children of the - | word. - - +attribute("n_lefts") - p - | The number of immediate syntactic children preceding the word in - | the string. - - +attribute("n_rights") - p - | The number of immediate syntactic children following the word in - | the string. - - +attribute("children") - p - | An iterator that yields from lefts, and then yields from rights. - - +attribute("subtree") - p - | An iterator for the part of the sentence syntactically governed by - | the word, including the word itself. - - +attribute("left_edge") - p The leftmost edge of the token's subtree - - +attribute("right_edge") - p The rightmost edge of the token's subtree - - details - summary: h4 Named Entities - - +attribute("ent_type") - p If the token is part of an entity, its entity type. - - +attribute("ent_iob") - p The IOB (inside, outside, begin) entity recognition tag for the token. - - details - summary: h4 Lexeme Flags - - +method("check_flag", "flag_id") - +params - +param("flag_id") - | flag ID - - +attribute("is_oov") - +attribute("is_alpha") - +attribute("is_ascii") - +attribute("is_digit") - +attribute("is_lower") - +attribute("is_title") - +attribute("is_punct") - +attribute("is_space") - +attribute("like_url") - +attribute("like_num") - +attribute("like_email") - - //+attribute("conjuncts") - // | Conjuncts - - +declare_class("Span") - +init - +method("__init__") - Temp - - span = doc[0:4] - - +sequence - +method("__getitem__") - p Get item - - +method("__iter__") - p Iter - - +method("__len__") - p Len - - details - summary: h4 Parse - - +attribute("root") - p Syntactic head - - +attribute("lefts") - p Tokens that are: - ol - li To the left of the span; - li Syntactic children of words within the span - - p i.e. - - pre.language-python - code - | lefts = [span.doc[i] for i in range(0, span.start) - | if span.doc[i].head in span] - - +attribute("rights") - p Tokens that are: - ol - li To the right of the span; - li Syntactic children of words within the span - p i.e. - pre.language-python - code - | rights = [span.doc[i] for i in range(span.end, len(span.doc)) - | if span.doc[i].head in span] - - - +attribute("subtree") - p String - - details - summary: h4 String Views - - +attribute("string") - p String - - +attribute("lemma / lemma_") - p String - - +attribute("label / label_") - p String - - +declare_class("Lexeme") - p - | The Lexeme object represents a lexical type, stored in the vocabulary - | – as opposed to a token, occurring in a document. - p - | Lexemes store various features, so that these features can be computed - | once per type, rather than once per token. As job sizes grow, this - | can amount to a substantial efficiency improvement. - - p - | All Lexeme attributes are therefore context independent, as a single - | lexeme is reused for all usages of that word. Lexemes are keyed by - | the “orth” attribute. - - p - All Lexeme attributes are accessible directly on the Token object. - - +init - +method("__init__") - p Init - - details - summary: h4 String Features - - +attribute("orth / orth_") - p - | The form of the word with no string normalization or processing, - | as it appears in the string, without trailing whitespace. 
- - +attribute("lower / lower_") - p Tmp - - +attribute("norm / norm_") - p Tmp - - +attribute("shape / shape_") - p Tmp - - +attribute("prefix / prefix_") - p Tmp - - +attribute("suffix / suffix_") - p TMP - - +declare_class("Vocab", "data_dir=None, lex_props_getter=None") - +sequence - +method("__len__") - +returns - p Number of words in the vocabulary. - - +method("__iter__") - +returns - p Lexeme - - +maptype - +method("__getitem__", "key_int") - +params - +param("key") - p Integer ID - - +returns: p A Lexeme object - - +method("__getitem__", "key_str") - +params - +param("key_str", types.unicode) - p A string in the vocabulary - - +returns("Lexeme") - - +method("__setitem__", "orth_str", "props") - +params - +param("orth_str", types.unicode) - p The orth key - - +param("props", types.dict) - p A props dictionary - - +returns("None") - - details - summary: h4 Import/Export - - +method("dump", "loc") - +params - +param("loc", types.unicode) - p Path where the vocabulary should be saved - - +method("load_lexemes", "loc") - +params - +param("loc", types.unicode) - p Path to load the lexemes.bin file from - - +method("load_vectors", "loc") - +params - +param("loc", types.unicode) - p Path to load the vectors.bin from - - +declare_class("StringStore") - +init - Tmp - - +sequence - +method("__len__") - +returns("int") - p Number of strings in the string-store - - +method("__iter__") - +returns - p Lexeme - - +maptype - +method("__getitem__", "key_int") - +params - +param("key_int") - p An integer key - - +returns(types.unicode) - p The string that the integer key maps to - - +method("__getitem__", "key_unicode") - +params - +param("key_unicode") - p A key, as a unicode string - - +returns(types.int) - p The integer ID of the string. - - +method("__getitem__", "key_utf8_bytes") - +params - +param("key_utf8_bytes", types.bytes) - p p A key, as a UTF-8 encoded byte-string - - +returns(types.int) - p The integer ID of the string. - - details - summary: h4 Import/Export - - +method("dump", "loc") - +params - +param("loc") - p File path to save the strings.txt to. - - +method("load") - +params - +param("loc") - p File path to load the strings.txt from. - - script(src="js/prism.js") + +Section("API", "api", "api.jade") + +Section("Tutorals", "tutorials", "tutorials.jade") + +Section("Annotation Specifications", "spec", "spec.jade") diff --git a/docs/redesign/home.jade b/docs/redesign/home.jade index c89d830cd..a628da2db 100644 --- a/docs/redesign/home.jade +++ b/docs/redesign/home.jade @@ -1,5 +1,7 @@ extends ./outline.jade +include ./mixins.jade + // Notes // // 1. Where to put version notice? Should say something like @@ -16,11 +18,13 @@ mixin lede - var state_of_the_art = 'state-of-the-art' - var a_minor_miracle = 'a minor miracle' - var great_documentation = 'great documentation' + - var concise_API = 'concise API' p. spaCy is a - library for industrial-strength NLP in Python and Cython. It features - !{state_of_the_art} speed and accuracy, a concise API, and great documentation. + library for industrial-strength natural language processing in Python and + Cython. It features !{state_of_the_art} speed and accuracy, a !{concise_API}, + and license terms designed to get out of your way. If you're a small company doing NLP, we want spaCy to seem like !{a_minor_miracle}. @@ -36,7 +40,6 @@ mixin get_started() p. 
Get Started - mixin comparison(name) details summary @@ -68,19 +71,6 @@ mixin social | Discuss on Reddit -mixin Section(title_text, link_name, include_file) - a(name=link_name): h3 #{title_text} - - if (link_name == "example-use") - include ./usage_examples.jade - else if (link_name == "online-demo") - include ./online_demo.jade - else if (link_name == "comparisons") - include ./comparisons.jade - else if (link_name == "install") - include ./installation.jade - - block intro_block section(class="intro") +lede @@ -90,7 +80,9 @@ block intro_block li: a(href="#example-use" class="button") Examples li: a(href="#online-demo" class="button") Demo li: a(href="#comparisons" class="button") Comparisons - li: a(href="#install" class="button") Install v0.89 + li: a(href="#install" class="button") + | Install + v0.89 block body_block @@ -103,4 +95,3 @@ block body_block +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") +Section("Install", "install", "./install.jade") - diff --git a/docs/redesign/installation.jade b/docs/redesign/installation.jade index 05f89dd24..50736e0ff 100644 --- a/docs/redesign/installation.jade +++ b/docs/redesign/installation.jade @@ -1,40 +1,71 @@ -p With Python 2.7 or Python 3, using Linux or OSX, run: +mixin Option(name, open) + details(open=open) + summary + h4= name + block -pre.language-bash: code - | $ pip install spacy - | $ python -m spacy.en.download ++Option("conda", true) + pre.language-bash: code + | $ conda install spacy + | $ python -m spacy.en.download -p - | The download command fetches and installs about 300mb of data, for - | the parser model and word vectors, which it installs within the spacy.en - | package directory. ++Option("pip and virtualenv", true) + p With Python 2.7 or Python 3, using Linux or OSX, run: -p - | If you're stuck using a server with an old version of Python, and you - | don't have root access, I've prepared a bootstrap script to help you - | compile a local Python install. Run: + pre.language-bash: code + | $ pip install spacy + | $ python -m spacy.en.download -pre.language-bash: code - | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate + p + | The download command fetches and installs about 300mb of data, for + | the parser model and word vectors, which it installs within the spacy.en + | package directory. -p - | The other way to install the package is to clone the github repository, - | and build it from source. This installs an additional dependency, - | Cython. If you're using Python 2, I also recommend installing fabric - | and fabtools – this is how I build the project. -pre.language-bash: code - | $ git clone https://github.com/honnibal/spaCy.git - | $ cd spaCy - | $ virtualenv .env && source .env/bin/activate - | $ export PYTHONPATH=`pwd` - | $ pip install -r requirements.txt - | $ python setup.py build_ext --inplace - | $ python -m spacy.en.download - | $ pip install pytest - | $ py.test tests/ + +Option("Workaround for obsolete system Python", false) + p + | If you're stuck using a server with an old version of Python, and you + | don't have root access, I've prepared a bootstrap script to help you + | compile a local Python install. 
Run: + + pre.language-bash: code + | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate + + + ++Option("Compile from source", false) + p + | The other way to install the package is to clone the github repository, + | and build it from source. This installs an additional dependency, + | Cython. If you're using Python 2, I also recommend installing fabric + | and fabtools – this is how I build the project. + + pre.language-bash: code + | $ git clone https://github.com/honnibal/spaCy.git + | $ cd spaCy + | $ virtualenv .env && source .env/bin/activate + | $ export PYTHONPATH=`pwd` + | $ pip install -r requirements.txt + | $ python setup.py build_ext --inplace + | $ python -m spacy.en.download + | $ pip install pytest + | $ py.test tests/ + + p + | Python packaging is awkward at the best of times, and it's particularly tricky + | with C extensions, built via Cython, requiring large data files. So, + | please report issues as you encounter them. + ++Option("pypy (Unsupported)") + | If PyPy support is a priority for you, please get in touch. We could likely + | fix the remaining issues, if necessary. However, the library is likely to + | be much slower on PyPy, as it's written in Cython, which produces code tuned + | for the performance of CPython. + ++Option("Windows (Unsupported)") + | Unfortunately we don't currently have access to a Windows machine, and have + | no experience developing on a MicroSoft stack. In theory the only problems are + | with the installation and packaging – there should be no deep platform + | dependency. Unfortunately we can't debug these issues at present, simply due + | to lack of a development environment. -p - | Python packaging is awkward at the best of times, and it's particularly tricky - | with C extensions, built via Cython, requiring large data files. So, - | please report issues as you encounter them. diff --git a/docs/redesign/license.jade b/docs/redesign/license.jade new file mode 100644 index 000000000..d8dc2135e --- /dev/null +++ b/docs/redesign/license.jade @@ -0,0 +1,179 @@ +extends ./outline.jade + +mixin columns(...names) + tr + each name in names + th= name + + +mixin row(...cells) + tr + each cell in cells + td= cell + + +mixin LicenseOption(name, period, price, audience) + .item + h4 #{name} + + .focus #{period} + + span #{price} + + h5 Suggested for: + + span #{audience} + + a.button(href="spacy_trial_free.docx") Download license + + span or + a(href="#") get in touch + + +block body_block + article.pricing + + .box.license + +LicenseOption("Trial", "90 days", "$0", "Evaluation") + +LicenseOption("Production", "1 year", "$5,000", "Production") + +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") + + p.caption + | Researcher, hobbyist, or open-source developer? spaCy also offers + a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3 + | licenses. + + p. + What we offer is a rare, simple certainty: a long-term, permissive license + that comes with full access to the source, complete transparency, and almost + complete flexibility. The difference between this and a black-box API is + night and day. You cannot build a great product against a service you + don't understand, and you can't build a great business on a service you + don't control. + + p + | Let's face it: services disappear. Constantly. The good start-ups get + | bought; the bad ones go bankrupt. Open-source projects become abandoned + | or bloated. 
Google's graveyard is over-flowing – ditto for Yahoo!, + | Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset? + + p + | A 5 year license won't expire until 2020. spaCy will be with you for + | longer than most of your current staff. If that's still not enough, + | get in touch. I'm sure we can work something out. + + //p. + // To make spaCy as valuable as possible, licenses to it are for life. You get + // complete transparency, certainty and control. If you need to use spaCy + // as an API, it's trivial to host it yourself – and you don't need to + // worry about the service changing or disappearing. And if you're ever in + // acquisition or IPO talks, the story is simple. + + //p. + // spaCy can also be used as free open-source software, under the Aferro GPL + // license. If you use it this way, you must comply with the AGPL license + // terms. When you distribute your project, or offer it as a network service, + // you must distribute the source-code and grant users an AGPL license to it. + + + //h3 Examples + + //p. + // In order to clarify how spaCy's license structure might apply to you, I've + // written a few examples, in the form of user-stories. + + //details + // summary: h4 Seed stage start-ups + + // p. + // Ashley and Casey have an idea for a start-up. To explore their idea, they + // want to build a minimum viable product they can put in front of potential + // users and investors. + + // p. They have two options. + + // ol + // li + // p. + // Trial commercial license. With a simple form, they can + // use spaCy for 90 days, for a nominal fee of $1. They are free to modify + // spaCy, and they will own the copyright to their modifications for the + // duration of the license. After the trial period elapses, they can either + // pay the license fee, stop using spaCy, release their project under the + // AGPL. + // + // li + // p. + // AGPL. Casey and Pat can instead use spaCy under the AGPL + // license. However, they must then release any code that statically or + // dynamically links to spaCy under the AGPL as well (e.g. if they import + // the module, or import a module that imports it, etc). They also cannot + // use spaCy as a network resource, by running it as a service --- this is + // the loophole that the "A" part of the AGPL is designed to close. + // + // p. + // Ashley and Casey find the AGPL license unattractive for commercial use. + // They decide to take up the trial commercial license. However, over the + // next 90 days, Ashley has to move house twice, and Casey gets sick. By + // the time the trial expires, they still don't have a demo they can show + // investors. They send an email explaining the situation, and a 90 day extension + // to their trial license is granted. + + // p. + // By the time the extension period has elapsed, spaCy has helped them secure + // funding, and they even have a little revenue. They are glad to pay the + // $5,000 commercial license fee. + + // p. + // spaCy is now permanently licensed for the product Ashley and Casey are + // developing. They own the copyright to any modifications they make to spaCy, + // but not to the original spaCy code. + + // p. + // No additional fees will be due when they hire new developers, run spaCy on + // additional internal servers, etc. If their company is acquired, the license + // will be transferred to the company acquiring them. However, to use spaCy + // in another product, they will have to buy a second license. 
+ + + // details + // summary: h4 University academics + + // p. + // Alex and Sasha are post-doctoral researchers working for a university. + // Part of their funding comes from a grant from Google, but Google will not + // own any part of the work that they produce. Their mission is just to write + // papers. + + // p. + // Alex and Sasha find spaCy convenient, so they use it in their system under + // the AGPL. This means that their system must also be released under the + // AGPL, but they're cool with that – they were going to release their + // code anyway, as it's the only way to ensure their experiments are properly + // repeatable. + + // p. + // Alex and Sasha find and fix a few bugs in spaCy. They must release these + // modifications, and they ask that they be accepted into the main spaCy repo. + // In order to do this, they must sign a contributor agreement, ceding their + // copyright. When commercial licenses to spaCy are sold, Alex and Sasha will + // not be able to claim any royalties from their contributions. + + // p. + // Later, Alex and Sasha implement new features into spaCy, for another paper. + // The code was quite rushed, and they don't want to take the time to put + // together a proper pull request. They must release their modifications + // under the AGPL, but they are not obliged to contribute it to the spaCy + // repository, or concede their copyright. + + // details + // summary: h4 Open Source developers + + // p. + // Phuong and Jessie use the open-source software Calibre to manage their + // e-book libraries. They have an idea for a search feature, and they want + // to use spaCy to implement it. Calibre is released under the GPLv3. The + // AGPL has additional restrictions for projects used as a network resource, + // but they don't apply to this project, so Phuong and Jessie can use spaCy + // to improve Calibre. They'll have to release their code, but that was + // always their intention anyway. diff --git a/docs/redesign/mixins.jade b/docs/redesign/mixins.jade new file mode 100644 index 000000000..34ad293aa --- /dev/null +++ b/docs/redesign/mixins.jade @@ -0,0 +1,19 @@ +mixin Section(title_text, link_name, include_file) + h3: a(name=link_name href=link_name) #{title_text} + + if (link_name == "example-use") + include ./usage_examples.jade + else if (link_name == "online-demo") + include ./online_demo.jade + else if (link_name == "comparisons") + include ./comparisons.jade + else if (link_name == "install") + include ./installation.jade + else if (link_name == "api") + include ./api.jade + else if (link_name == "tutorials") + include ./tutorials.jade + else if (link_name == "spec") + include ./spec.jade + + diff --git a/docs/redesign/outline.jade b/docs/redesign/outline.jade index 2389dc71e..1ae9eacfa 100644 --- a/docs/redesign/outline.jade +++ b/docs/redesign/outline.jade @@ -21,10 +21,10 @@ html(lang="en") nav(role="navigation") ul - li: a(href="#") Home - li: a(href="#") Docs - li: a(href="#") License - li: a(href="#") More + li: a(href="home.html") Home + li: a(href="docs.html") Docs + li: a(href="license.html") License + li: a(href="blog.html") Blog main(id="content" role="main") block intro_block diff --git a/docs/redesign/spec.jade b/docs/redesign/spec.jade index a61a4f356..4c459f409 100644 --- a/docs/redesign/spec.jade +++ b/docs/redesign/spec.jade @@ -1,5 +1,3 @@ -extends ./outline.jade - mixin columns(...names) tr each name in names @@ -12,112 +10,120 @@ mixin row(...cells) td= cell -block body_block - article(class="page docs-page") - p. 
- This document describes the target annotations spaCy is trained to predict. - This is currently a work in progress. Please ask questions on the issue tracker, - so that the answers can be integrated here to improve the documentation. +details + summary: h4 Overview + + p. + This document describes the target annotations spaCy is trained to predict. + This is currently a work in progress. Please ask questions on the issue tracker, + so that the answers can be integrated here to improve the documentation. - h2 Tokenization +details + summary: h4 Tokenization - p Tokenization standards are based on the OntoNotes 5 corpus. + p Tokenization standards are based on the OntoNotes 5 corpus. - p. - The tokenizer differs from most by including tokens for significant - whitespace. Any sequence of whitespace characters beyond a single space - (' ') is included as a token. For instance: + p. + The tokenizer differs from most by including tokens for significant + whitespace. Any sequence of whitespace characters beyond a single space + (' ') is included as a token. For instance: - pre.language-python - code - | from spacy.en import English - | nlp = English(parse=False) - | tokens = nlp('Some\nspaces and\ttab characters') - | print([t.orth_ for t in tokens]) + pre.language-python + code + | from spacy.en import English + | nlp = English(parse=False) + | tokens = nlp('Some\nspaces and\ttab characters') + | print([t.orth_ for t in tokens]) - p Which produces: + p Which produces: - pre.language-python - code - | ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] + pre.language-python + code + | ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] - p. - The whitespace tokens are useful for much the same reason punctuation is - – it's often an important delimiter in the text. By preserving - it in the token output, we are able to maintain a simple alignment - between the tokens and the original string, and we ensure that no - information is lost during processing. + p. + The whitespace tokens are useful for much the same reason punctuation is + – it's often an important delimiter in the text. By preserving + it in the token output, we are able to maintain a simple alignment + between the tokens and the original string, and we ensure that no + information is lost during processing. - h3 Sentence boundary detection +details + summary: h4 Sentence boundary detection - p. - Sentence boundaries are calculated from the syntactic parse tree, so - features such as punctuation and capitalisation play an important but - non-decisive role in determining the sentence boundaries. Usually this - means that the sentence boundaries will at least coincide with clause - boundaries, even given poorly punctuated text. + p. + Sentence boundaries are calculated from the syntactic parse tree, so + features such as punctuation and capitalisation play an important but + non-decisive role in determining the sentence boundaries. Usually this + means that the sentence boundaries will at least coincide with clause + boundaries, even given poorly punctuated text. - h3 Part-of-speech Tagging +details + summary: h4 Part-of-speech Tagging - p. - The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank - tag set. We also map the tags to the simpler Google Universal POS Tag set. + p. + The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank + tag set. We also map the tags to the simpler Google Universal POS Tag set. 
- Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 + p. + Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 - h3 Lemmatization +details + summary: h4 Lemmatization - p. - A "lemma" is the uninflected form of a word. In English, this means: + p. + A "lemma" is the uninflected form of a word. In English, this means: - ul - li Adjectives: The form like "happy", not "happier" or "happiest" - li Adverbs: The form like "badly", not "worse" or "worst" - li Nouns: The form like "dog", not "dogs"; like "child", not "children" - li Verbs: The form like "write", not "writes", "writing", "wrote" or "written" + ul + li Adjectives: The form like "happy", not "happier" or "happiest" + li Adverbs: The form like "badly", not "worse" or "worst" + li Nouns: The form like "dog", not "dogs"; like "child", not "children" + li Verbs: The form like "write", not "writes", "writing", "wrote" or "written" - p. - The lemmatization data is taken from WordNet. However, we also add a - special case for pronouns: all pronouns are lemmatized to the special - token -PRON-. + p. + The lemmatization data is taken from WordNet. However, we also add a + special case for pronouns: all pronouns are lemmatized to the special + token -PRON-. - h3 Syntactic Dependency Parsing +details + summary: h4 Syntactic Dependency Parsing - p. - The parser is trained on data produced by the ClearNLP converter. Details - of the annotation scheme can be found here: http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf + p. + The parser is trained on data produced by the ClearNLP converter. Details + of the annotation scheme can be found here: http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf - h3 Named Entity Recognition +details + summary: h4 Named Entity Recognition - table - thead - +columns("Entity Type", "Description") + table + thead + +columns("Entity Type", "Description") - tbody - +row("PERSON", "People, including fictional.") - +row("NORP", "Nationalities or religious or political groups.") - +row("FACILITY", "Buildings, airports, highways, bridges, etc.") - +row("ORG", "Companies, agencies, institutions, etc.") - +row("GPE", "Countries, cities, states.") - +row("LOC", "Non-GPE locations, mountain ranges, bodies of water.") - +row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services") - +row("EVENT", "Named hurricanes, battles, wars, sports events, etc.") - +row("WORK_OF_ART", "Titles of books, songs, etc.") - +row("LAW", "Named documents made into laws") - +row("LANGUAGE", "Any named language") + tbody + +row("PERSON", "People, including fictional.") + +row("NORP", "Nationalities or religious or political groups.") + +row("FACILITY", "Buildings, airports, highways, bridges, etc.") + +row("ORG", "Companies, agencies, institutions, etc.") + +row("GPE", "Countries, cities, states.") + +row("LOC", "Non-GPE locations, mountain ranges, bodies of water.") + +row("PRODUCT", "Vehicles, weapons, foods, etc. 
(Not services") + +row("EVENT", "Named hurricanes, battles, wars, sports events, etc.") + +row("WORK_OF_ART", "Titles of books, songs, etc.") + +row("LAW", "Named documents made into laws") + +row("LANGUAGE", "Any named language") - p The following values are also annotated in a style similar to names: + p The following values are also annotated in a style similar to names: - table - thead - +columns("Entity Type", "Description") + table + thead + +columns("Entity Type", "Description") - tbody - +row("DATE", "Absolute or relative dates or periods") - +row("TIME", "Times smaller than a day") - +row("PERCENT", 'Percentage (including “%”)') - +row("MONEY", "Monetary values, including unit") - +row("QUANTITY", "Measurements, as of weight or distance") - +row("ORDINAL", 'first", "second"') - +row("CARDINAL", "Numerals that do not fall under another type") + tbody + +row("DATE", "Absolute or relative dates or periods") + +row("TIME", "Times smaller than a day") + +row("PERCENT", 'Percentage (including “%”)') + +row("MONEY", "Monetary values, including unit") + +row("QUANTITY", "Measurements, as of weight or distance") + +row("ORDINAL", 'first", "second"') + +row("CARDINAL", "Numerals that do not fall under another type") diff --git a/docs/redesign/template_post.jade b/docs/redesign/template_post.jade new file mode 100644 index 000000000..0012d24b7 --- /dev/null +++ b/docs/redesign/template_post.jade @@ -0,0 +1,31 @@ +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog(role="document") + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + + nav(role="navigation") + ul + li: a(href="home.html") Home + li: a(href="docs.html") Docs + li.active: a(href="blog.html") Blog + li: a(href="license.html") License + + main#content(role='main') + block intro_block + + block body_block + + footer(role='contentinfo') + + script(src="js/prism.js") + script(src="js/details_polyfill.js") diff --git a/docs/redesign/tute_adverbs.jade b/docs/redesign/tute_adverbs.jade new file mode 100644 index 000000000..c81931b0e --- /dev/null +++ b/docs/redesign/tute_adverbs.jade @@ -0,0 +1,200 @@ +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + main#content(role='main') + article.post + + + :markdown-it + # Adverbs + + Let's say you're developing a proofreading tool, or possibly an IDE for + writers. You're convinced by Stephen King's advice that `adverbs are + not your friend `_, + so you want to **highlight all adverbs**. We'll use one of the examples + he finds particularly egregious: + + pre.language-python + code + | import spacy.en + | >>> from spacy.parts_of_speech import ADV + | >>> # Load the pipeline, and call it with some text. 
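 +        | >>> # (Constructing English() loads the model data from disk; expect it to take 10-20 seconds.)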
+ | >>> nlp = spacy.en.English() + | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False) + | >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) + | u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ + + :markdown-it + Easy enough --- but the problem is that we've also highlighted "back". + While "back" is undoubtedly an adverb, we probably don't want to highlight + it. If what we're trying to do is flag dubious stylistic choices, we'll + need to refine our logic. It turns out only a certain type of adverb + is of interest to us. + + + :markdown-it + There are lots of ways we might do this, depending on just what words + we want to flag. The simplest way to exclude adverbs like "back" and + "not" is by word frequency: these words are much more common than the + prototypical manner adverbs that the style guides are worried about. + + :markdown-it + The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attribute gives a + log probability estimate of the word: + + pre.language-python + code + | >>> nlp.vocab[u'back'].prob + | -7.403977394104004 + | >>> nlp.vocab[u'not'].prob + | -5.407193660736084 + | >>> nlp.vocab[u'quietly'].prob + | -11.07155704498291 + + :markdown-it + (The probability estimate is based on counts from a 3 billion word corpus, + smoothed using the `Simple Good-Turing`_ method.) + + So we can easily exclude the N most frequent words in English from our + adverb marker. Let's try N=1000 for now: + + pre.language-python + code + | >>> import spacy.en + | >>> from spacy.parts_of_speech import ADV + | >>> nlp = spacy.en.English() + | >>> # Find log probability of Nth most frequent word + | >>> probs = [lex.prob for lex in nlp.vocab] + | >>> probs.sort() + | >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] + | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") + | >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) + | ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’ + + :markdown-it + There are lots of other ways we could refine the logic, depending on + just what words we want to flag. Let's say we wanted to only flag + adverbs that modified words similar to "pleaded". This is easy to do, + as spaCy loads a vector-space representation for every word (by default, + the vectors produced by `Levy and Goldberg (2014)`_). Naturally, the + vector is provided as a numpy array: + + pre.language-python + code + | >>> pleaded = tokens[7] + | >>> pleaded.repvec.shape + | (300,) + | >>> pleaded.repvec[:5] + | array([ 0.04229792, 0.07459262, 0.00820188, -0.02181299, 0.07519238], dtype=float32) + + :markdown-it + We want to sort the words in our vocabulary by their similarity to + "pleaded". There are lots of ways to measure the similarity of two + vectors. 
We'll use the cosine metric: + + pre.language-python + code + | >>> from numpy import dot + | >>> from numpy.linalg import norm + + | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) + | >>> words = [w for w in nlp.vocab if w.has_repvec] + | >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) + | >>> words.reverse() + | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) + | 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading + | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) + | 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses + | >>> print('100-110', ', '.join(w.orth_ for w in words[100:110])) + | 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes + | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) + | 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged + | >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010])) + | 50000-50010, fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists + + :markdown-it + As you can see, the similarity model that these vectors give us is excellent + --- we're still getting meaningful results at 1000 words, off a single + prototype! The only problem is that the list really contains two clusters of + words: one associated with the legal meaning of "pleaded", and one for the more + general sense. Sorting out these clusters is an area of active research. + + A simple work-around is to average the vectors of several words, and use that + as our target: + + pre.language-python + code + | >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested'] + | >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs) + | >>> words.sort(key=lambda w: cosine(w.repvec * say_vector)) + | >>> words.reverse() + | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) + | 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired + | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) + | 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed + | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) + | 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate + + :markdown-it + These definitely look like words that King might scold a writer for attaching + adverbs to. Recall that our original adverb highlighting function looked like + this: + + pre.language-python + code + | >>> import spacy.en + | >>> from spacy.parts_of_speech import ADV + | >>> # Load the pipeline, and call it with some text. + | >>> nlp = spacy.en.English() + | >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", + | tag=True, parse=False) + | >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)) + | ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ + + + :markdown-it + We wanted to refine the logic so that only adverbs modifying evocative + verbs of communication, like "pleaded", were highlighted. 
We've now + built a vector that represents that type of word, so now we can highlight + adverbs based on subtle logic, honing in on adverbs that seem the most + stylistically problematic, given our starting assumptions: + + pre.language-python + code + | >>> import numpy + | >>> from numpy import dot + | >>> from numpy.linalg import norm + | >>> import spacy.en + | >>> from spacy.parts_of_speech import ADV, VERB + | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) + | >>> def is_bad_adverb(token, target_verb, tol): + | ... if token.pos != ADV + | ... return False + | ... elif token.head.pos != VERB: + | ... return False + | ... elif cosine(token.head.repvec, target_verb) < tol: + | ... return False + | ... else: + | ... return True + + :markdown-it + This example was somewhat contrived --- and, truth be told, I've never + really bought the idea that adverbs were a grave stylistic sin. But + hopefully it got the message across: the state-of-the-art NLP technologies + are very powerful. spaCy gives you easy and efficient access to them, + which lets you build all sorts of useful products and features that + were previously impossible. + + footer(role='contentinfo') + script(src='js/prism.js') diff --git a/docs/redesign/tute_syntax_search.jade b/docs/redesign/tute_syntax_search.jade new file mode 100644 index 000000000..c3679b83d --- /dev/null +++ b/docs/redesign/tute_syntax_search.jade @@ -0,0 +1,132 @@ +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + main#content(role='main') + section.intro + p + | Example use of the spaCy NLP tools for data exploration. + | Here we will look for reddit comments that describe Google doing something, + | i.e. discuss the company's actions. This is difficult, because other senses of + | "Google" now dominate usage of the word in conversation, particularly references to + | using Google products. + + p + | The heuristics used are quick and dirty – about 5 minutes work. + + //| A better approach is to use the word vector of the verb. But, the + // | demo here is just to show what's possible to build up quickly, to + // | start to understand some data. + + article.post + header + h2 Syntax-specific Search + .subhead + | by + a(href='#', rel='author') Matthew Honnibal + | on + time(datetime='2015-08-14') August + + details + summary: h4 Imports + + pre.language-python + code + | from __future__ import unicode_literals + | from __future__ import print_function + | import sys + | + | import plac + | import bz2 + | import ujson + | import spacy.en + + details + summary: h4 Load the model and iterate over the data + + pre.language-python + code + | def main(input_loc): + | nlp = spacy.en.English() # Load the model takes 10-20 seconds. + | for line in bz2.BZ2File(input_loc): # Iterate over the reddit comments from the dump. + | comment_str = ujson.loads(line)['body'] # Parse the json object, and extract the 'body' attribute. 
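 +        |         # 'body' is the raw comment text; the next snippet runs the spaCy pipeline over it.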
+ | + details + summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want + + pre.language-python + code + | comment_parse = nlp(comment_str) + | for word in comment_parse: + | if google_doing_something(word): + | # Print the clause + | print(''.join(w.string for w in word.head.subtree).strip()) + details + summary: h4 Define the filter function + + pre.language-python + code + + | + | def google_doing_something(w): + | if w.lower_ != 'google': + | return False + | # Is it the subject of a verb? + | elif w.dep_ != 'nsubj': + | return False + | # And not 'is' + | elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux': + | return False + | # Exclude e.g. "Google says..." + | elif w.head.lemma_ in ('say', 'show'): + | return False + | else: + | return True + | + | + + details + summary: h4 Call main + + pre.language-python + code + | if __name__ == '__main__': + | plac.call(main) + + details + summary: h4 Example output + + p. + Many false positives remain. Some are from incorrect interpretations + of the sentence by spaCy, some are flaws in our filtering logic. But + the results are vastly better than a string-based search, which returns + almost no examples of the pattern we're looking for. + + code + | Google dropped support for Android < 4.0 already + | google drive + | Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc + | When Google responds + | Google translate cyka pasterino. + | A quick google looks like Synology does have a sync'ing feature which does support block level so that should work + | (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible? + | Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop. + | Google offers something like this already, but it is truly terrible. + | google isn't helping me + | Google tells me: 0 results, 250 pages removed from google. 
+ | how did Google swoop in and eat our lunch + + + + script(src="js/prism.js") + script(src="js/details_polyfill.js") diff --git a/docs/redesign/tute_twitter.jade b/docs/redesign/tute_twitter.jade new file mode 100644 index 000000000..f8f849eed --- /dev/null +++ b/docs/redesign/tute_twitter.jade @@ -0,0 +1,204 @@ +doctype html +html(lang='en') + head + meta(charset='utf-8') + title spaCy Blog + meta(name='description', content='') + meta(name='author', content='Matthew Honnibal') + link(rel='stylesheet', href='css/style.css') + //if lt IE 9 + script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') + body#blog + header(role='banner') + h1.logo spaCy Blog + .slogan Blog + main#content(role='main') + article.post + header + h2 Finding Relevant Tweets + .subhead + | by + a(href='#', rel='author') Matthew Honnibal + | on + time(datetime='2015-08-14') December + + details + summary: h4 Imports + pre.language-python + + | from __future__ import unicode_literals, print_function + | import plac + | import codecs + | import sys + | import math + | + | import spacy.en + | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ + | + | from termcolor import colored + | from twython import TwythonStreamer + | + | from os import path + | from math import sqrt + | + | from numpy import dot + | from numpy.linalg import norm + | + | + + details + summary: h4 Simple vector-averaging similarity + + pre.language-python: code + + | class Meaning(object): + | def __init__(self, vectors): + | if vectors: + | self.vector = sum(vectors) / len(vectors) + | self.norm = norm(self.vector) + | else: + | self.vector = None + | self.norm = 0 + | + | @classmethod + | def from_path(cls, nlp, loc): + | with codecs.open(loc, 'r', 'utf8') as file_: + | terms = file_.read().strip().split() + | return cls.from_terms(nlp, terms) + | + | @classmethod + | def from_tokens(cls, nlp, tokens): + | vectors = [t.repvec for t in tokens] + | return cls(vectors) + | + | @classmethod + | def from_terms(cls, nlp, examples): + | lexemes = [nlp.vocab[eg] for eg in examples] + | vectors = [eg.repvec for eg in lexemes] + | return cls(vectors) + | + | def similarity(self, other): + | if not self.norm or not other.norm: + | return -1 + | return dot(self.vector, other.vector) / (self.norm * other.norm) + | + + details + summary: h4 Print matches + + pre.language-python: code + + | + | def print_colored(model, stream=sys.stdout): + | if model['is_match']: + | color = 'green' + | elif model['is_reject']: + | color = 'red' + | else: + | color = 'grey' + | + | if not model['is_rare'] and model['is_match'] and not model['is_reject']: + | match_score = colored('%.3f' % model['match_score'], 'green') + | reject_score = colored('%.3f' % model['reject_score'], 'red') + | prob = '%.5f' % model['prob'] + | + | print(match_score, reject_score, prob) + | print(repr(model['text']), color) + | print('') + | + | + + details + summary: h4 TextMatcher: Process the tweets using spaCy + + pre.language-python: code + + | class TextMatcher(object): + | def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject): + | self.nlp = nlp + | self.get_target = get_target + | self.get_reject = get_reject + | self.min_prob = min_prob + | self.min_match = min_match + | self.max_reject = max_reject + | + | def __call__(self, text): + | tweet = self.nlp(text) + | target_terms = self.get_target() + | reject_terms = self.get_reject() + | + | prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) + | meaning = Meaning.from_tokens(self, tweet) + | + | 
match_score = meaning.similarity(self.get_target()) + | reject_score = meaning.similarity(self.get_reject()) + | return { + | 'text': tweet.string, + | 'prob': prob, + | 'match_score': match_score, + | 'reject_score': reject_score, + | 'is_rare': prob < self.min_prob, + | 'is_match': prob >= self.min_prob and match_score >= self.min_match, + | 'is_reject': prob >= self.min_prob and reject_score >= self.max_reject + | } + | + | + + details + summary: h4 Connect to Twitter and stream tweets + + pre.language-python: code + + | class Connection(TwythonStreamer): + | def __init__(self, keys_dir, handler, view): + | keys = Secrets(keys_dir) + | TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret) + | self.handler = handler + | self.view = view + | + | def on_success(self, data): + | text = data.get('text', u'') + | # Twython returns either bytes or unicode, depending on tweet. + | # #APIshaming + | try: + | model = self.handler(text) + | except TypeError: + | model = self.handler(text.decode('utf8')) + | status = self.view(model, sys.stdin) + | + | def on_error(self, status_code, data): + | print(status_code) + | + | + | class Secrets(object): + | def __init__(self, key_dir): + | self.key = open(path.join(key_dir, 'key.txt')).read().strip() + | self.secret = open(path.join(key_dir, 'secret.txt')).read().strip() + | self.token = open(path.join(key_dir, 'token.txt')).read().strip() + | self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip() + | + | + + details + summary: h4 Command-line interface + + pre.language-python: code + + | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5): + | # We don't need the parser for this demo, so may as well save the loading time + | nlp = spacy.en.English(Parser=None) + | get_target = lambda: Meaning.from_path(nlp, target_loc) + | get_reject = lambda: Meaning.from_path(nlp, reject_loc) + | matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject) + | + | twitter = Connection(keys_dir, matcher, print_colored) + | twitter.statuses.filter(track=term) + | + | + | if __name__ == '__main__': + | plac.call(main) + | + + footer(role='contentinfo') + script(src='js/prism.js') + diff --git a/docs/redesign/tutorials.jade b/docs/redesign/tutorials.jade new file mode 100644 index 000000000..e69de29bb diff --git a/docs/redesign/usage_examples.jade b/docs/redesign/usage_examples.jade index d429339d4..418ca9c57 100644 --- a/docs/redesign/usage_examples.jade +++ b/docs/redesign/usage_examples.jade @@ -106,4 +106,11 @@ mixin example(name) +example("Efficient binary serialization") pre.language-python: code - | + + | byte_string = doc.as_bytes() + | open('/tmp/moby_dick.bin', 'wb').write(byte_string) + + | nlp = spacy.en.English() + | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')): + | doc = Doc(nlp.vocab) + | doc.from_bytes(byte_string) From 6cc9e7881b4f44b8b61587b5b9020b769995be5a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 14 Aug 2015 22:03:19 +0200 Subject: [PATCH 070/138] * Work on website --- docs/redesign/blog_tagger.jade | 492 +++++++++++++++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100644 docs/redesign/blog_tagger.jade diff --git a/docs/redesign/blog_tagger.jade b/docs/redesign/blog_tagger.jade new file mode 100644 index 000000000..63ac8e77e --- /dev/null +++ b/docs/redesign/blog_tagger.jade @@ -0,0 +1,492 @@ +extends ./template_post.jade + +block body_block + - var urls = {} + - 
urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" + + + article.post + header + h2 A good Part-of-Speech tagger in about 200 lines of Python + .subhead + | by + a(href="#" rel="author") Matthew Honnibal + | on + time(datetime='2013-09-11') October 11, 2013 + + p. + Up-to-date knowledge about natural language processing is mostly locked away + in academia. And academics are mostly pretty self-conscious when we write. + We’re careful. We don’t want to stick our necks out too much. But under-confident + recommendations suck, so here’s how to write a good part-of-speech tagger. + + p. + There are a tonne of “best known techniques” for POS tagging, and you should + ignore the others and just use Averaged Perceptron. + + p. + You should use two tags of history, and features derived from the Brown word + clusters distributed here. + + p. + If you only need the tagger to work on carefully edited text, you should + use case-sensitive features, but if you want a more robust tagger you + should avoid them because they’ll make you over-fit to the conventions + of your training domain. Instead, features that ask “how frequently is + this word title-cased, in a large sample from the web?” work well. Then + you can lower-case your comparatively tiny training corpus. + + p. + For efficiency, you should figure out which frequent words in your training + data have unambiguous tags, so you don’t have to do anything but output + their tags when they come up. About 50% of the words can be tagged that way. + + p. + And unless you really, really can’t do without an extra 0.1% of accuracy, + you probably shouldn’t bother with any kind of search strategy you should + just use a greedy model. + + p. + If you do all that, you’ll find your tagger easy to write and understand, + and an efficient Cython implementation will perform as follows on the standard + evaluation, 130,000 words of text from the Wall Street Journal: + + table + thead + tr + th Tagger + th Accuracy + th Time (130k words) + tbody + tr + td CyGreedyAP + td 97.1% + td 4s + + p. + The 4s includes initialisation time — the actual per-token speed is high + enough to be irrelevant; it won’t be your bottleneck. + + p. + It’s tempting to look at 97% accuracy and say something similar, but that’s + not true. My parser is about 1% more accurate if the input has hand-labelled + POS tags, and the taggers all perform much worse on out-of-domain data. + Unfortunately accuracies have been fairly flat for the last ten years. + That’s why my recommendation is to just use a simple and fast tagger that’s + roughly as good. + + p. + The thing is though, it’s very common to see people using taggers that + aren’t anywhere near that good! For an example of what a non-expert is + likely to use, these were the two taggers wrapped by TextBlob, a new Python + api that I think is quite neat: + + table + thead + tr + th Tagger + th Accuracy + th Time (130k words) + tbody + tr + td NLTK + td 94.0% + td 3m56s + tr + td Pattern + td 93.5% + td 26s + + p. + Both Pattern and NLTK are very robust and beautifully well documented, so + the appeal of using them is obvious. But Pattern’s algorithms are pretty + crappy, and NLTK carries tremendous baggage around in its implementation + because of its massive framework, and double-duty as a teaching tool. + + p. + As a stand-alone tagger, my Cython implementation is needlessly complicated + – it was written for my parser. 
So today I wrote a 200 line version + of my recommended algorithm for TextBlob. It gets: + + table + thead + tr + th Tagger + th Accuracy + th Time (130k words) + tbody + tr + td PyGreedyAP + td 96.8% + td 12s + + p. + I traded some accuracy and a lot of efficiency to keep the implementation + simple. Here’s a far-too-brief description of how it works. + + h3 Averaged perceptron + + p. + POS tagging is a “supervised learning problem”. You’re given a table of data, + and you’re told that the values in the last column will be missing during + run-time. You have to find correlations from the other columns to predict + that value. + + p. + So for us, the missing column will be “part of speech at word i“. The predictor + columns (features) will be things like “part of speech at word i-1“, “last three + letters of word at i+1“, etc + + p. + First, here’s what prediction looks like at run-time: + + pre.language-python + code + | def predict(self, features): + | '''Dot-product the features and current weights and return the best class.''' + | scores = defaultdict(float) + | for feat in features: + | if feat not in self.weights: + | continue + | weights = self.weights[feat] + | for clas, weight in weights.items(): + | scores[clas] += weight + | # Do a secondary alphabetic sort, for stability + | return max(self.classes, key=lambda clas: (scores[clas], clas)) + + p. + Earlier I described the learning problem as a table, with one of the columns + marked as missing-at-runtime. For NLP, our tables are always exceedingly + sparse. You have columns like “word i-1=Parliament”, which is almost always + 0. So our “weight vectors” can pretty much never be implemented as vectors. + Map-types are good though — here we use dictionaries. + + p. + The input data, features, is a set with a member for every non-zero “column” + in our “table” – every active feature. Usually this is actually a dictionary, + to let you set values for the features. But here all my features are binary + present-or-absent type deals. + + p. + The weights data-structure is a dictionary of dictionaries, that ultimately + associates feature/class pairs with some weight. You want to structure it + this way instead of the reverse because of the way word frequencies are + distributed: most words are rare, frequent words are very frequent. + + h3 Learning the weights + + p. + Okay, so how do we get the values for the weights? We start with an empty + weights dictionary, and iteratively do the following: + + ol + li Receive a new (features, POS-tag) pair + li Guess the value of the POS tag given the current “weights” for the features + li If guess is wrong, add +1 to the weights associated with the correct class for these features, and -1 to the weights for the predicted class. + + + p. + It’s one of the simplest learning algorithms. Whenever you make a mistake, + increment the weights for the correct class, and penalise the weights that + led to your false prediction. In code: + + pre.language-python + code + | def train(self, nr_iter, examples): + | for i in range(nr_iter): + | for features, true_tag in examples: + | guess = self.predict(features) + | if guess != true_tag: + | for f in features: + | self.weights[f][true_tag] += 1 + | self.weights[f][guess] -= 1 + | random.shuffle(examples) + p. + If you iterate over the same example this way, the weights for the correct + class would have to come out ahead, and you’d get the example right. 
If + you think about what happens with two examples, you should be able to + see that it will get them both right unless the features are identical. + In general the algorithm will converge so long as the examples are + linearly separable, although that doesn’t matter for our purpose. + + h3 Averaging the weights + + p. + We need to do one more thing to make the perceptron algorithm competitive. + The problem with the algorithm so far is that if you train it twice on + slightly different sets of examples, you end up with really different models. + It doesn’t generalise that smartly. And the problem is really in the later + iterations — if you let it run to convergence, it’ll pay lots of attention + to the few examples it’s getting wrong, and mutate its whole model around + them. + + p. + So, what we’re going to do is make the weights more "sticky" – give + the model less chance to ruin all its hard work in the later rounds. And + we’re going to do that by returning the averaged weights, not the final + weights. + + p. + I doubt there are many people who are convinced that’s the most obvious + solution to the problem, but whatever. We’re not here to innovate, and this + way is time tested on lots of problems. If you have another idea, run the + experiments and tell us what you find. Actually I’d love to see more work + on this, now that the averaged perceptron has become such a prominent learning + algorithm in NLP. + + p. + Okay. So this averaging. How’s that going to work? Note that we don’t want + to just average after each outer-loop iteration. We want the average of all + the values — from the inner loop. So if we have 5,000 examples, and we train + for 10 iterations, we’ll average across 50,000 values for each weight. + + p. + Obviously we’re not going to store all those intermediate values. Instead, + we’ll track an accumulator for each weight, and divide it by the number of + iterations at the end. Again: we want the average weight assigned to a + feature/class pair during learning, so the key component we need is the total + weight it was assigned. But we also want to be careful about how we compute + that accumulator, too. On almost any instance, we’re going to see a tiny + fraction of active feature/class pairs. All the other feature/class weights + won’t change. So we shouldn’t have to go back and add the unchanged value + to our accumulators anyway, like chumps. + + p. + Since we’re not chumps, we’ll make the obvious improvement. We’ll maintain + another dictionary that tracks how long each weight has gone unchanged. Now + when we do change a weight, we can do a fast-forwarded update to the accumulator, + for all those iterations where it lay unchanged. + + p. + Here’s what a weight update looks like now that we have to maintain the + totals and the time-stamps: + + pre.language-python + code + | def update(self, truth, guess, features): + | def upd_feat(c, f, v): + | nr_iters_at_this_weight = self.i - self._timestamps[f][c] + | self._totals[f][c] += nr_iters_at_this_weight * self.weights[f][c] + | self.weights[f][c] += v + | self._timestamps[f][c] = self.i + + | self.i += 1 + | for f in features: + | upd_feat(truth, f, 1.0) + | upd_feat(guess, f, -1.0) + + h3 Features and pre-processing + + p. + The POS tagging literature has tonnes of intricate features sensitive to + case, punctuation, etc. They help on the standard test-set, which is from + Wall Street Journal articles from the 1980s, but I don’t see how they’ll + help us learn models that are useful on other text. + + p. 
+ To help us learn a more general model, we’ll pre-process the data prior + to feature extraction, as follows: + + ul + li All words are lower cased; + li Digits in the range 1800-2100 are represented as !YEAR; + li Other digit strings are represented as !DIGITS + li + | It would be better to have a module recognising dates, phone numbers, + | emails, hash-tags, etc. but that will have to be pushed back into the + | tokenization. + + p. + I played around with the features a little, and this seems to be a reasonable + bang-for-buck configuration in terms of getting the development-data accuracy + to 97% (where it typically converges anyway), and having a smaller memory + foot-print: + + pre.language-python + code + | def _get_features(self, i, word, context, prev, prev2): + | '''Map tokens-in-contexts into a feature representation, implemented as a + | set. If the features change, a new model must be trained.''' + | def add(name, *args): + | features.add('+'.join((name,) + tuple(args))) + + | features = set() + | add('bias') # This acts sort of like a prior + | add('i suffix', word[-3:]) + | add('i pref1', word[0]) + | add('i-1 tag', prev) + | add('i-2 tag', prev2) + | add('i tag+i-2 tag', prev, prev2) + | add('i word', context[i]) + | add('i-1 tag+i word', prev, context[i]) + | add('i-1 word', context[i-1]) + | add('i-1 suffix', context[i-1][-3:]) + | add('i-2 word', context[i-2]) + | add('i+1 word', context[i+1]) + | add('i+1 suffix', context[i+1][-3:]) + | add('i+2 word', context[i+2]) + | return features + + p. + I haven’t added any features from external data, such as case frequency + statistics from the Google Web 1T corpus. I might add those later, but for + now I figured I’d keep things simple. + + h3 What about search? + + p. + The model I’ve recommended commits to its predictions on each word, and + moves on to the next one. Those predictions are then used as features for + the next word. There’s a potential problem here, but it turns out it doesn’t + matter much. It’s easy to fix with beam-search, but I say it’s not really + worth bothering. And it definitely doesn’t matter enough to adopt a slow + and complicated algorithm like Conditional Random Fields. + + p. + Here’s the problem. The best indicator for the tag at position, say, 3 in + a sentence is the word at position 3. But the next-best indicators are the + tags at positions 2 and 4. So there’s a chicken-and-egg problem: we want + the predictions for the surrounding words in hand before we commit to a + prediction for the current word. Here’s an example where search might matter: + + p.example. + Their management plan reforms worked + + p. + Depending on just what you’ve learned from your training data, you can + imagine making a different decision if you started at the left and moved + right, conditioning on your previous decisions, than if you’d started at + the right and moved left. + + p. + If that’s not obvious to you, think about it this way: “worked” is almost + surely a verb, so if you tag “reforms” with that in hand, you’ll have a + different idea of its tag than if you’d just come from “plan“, which you + might have regarded as either a noun or a verb. + + p. + Search can only help you when you make a mistake. It can prevent that error + from throwing off your subsequent decisions, or sometimes your future choices + will correct the mistake. And that’s why for POS tagging, search hardly matters! + Your model is so good straight-up that your past predictions are almost always + true. 
So you really need the planets to align for search to matter at all. + + p. + And as we improve our taggers, search will matter less and less. Instead + of search, what we should be caring about is multi-tagging. If we let the + model be a bit uncertain, we can get over 99% accuracy assigning an average + of 1.05 tags per word (Vadas et al, ACL 2006). The averaged perceptron is + rubbish at multi-tagging though. That’s its big weakness. You really want + a probability distribution for that. + + p. + One caveat when doing greedy search, though. It’s very important that your + training data model the fact that the history will be imperfect at run-time. + Otherwise, it will be way over-reliant on the tag-history features. Because + the Perceptron is iterative, this is very easy. + + p. + Here’s the training loop for the tagger: + + pre.language-python + code + | def train(self, sentences, save_loc=None, nr_iter=5, quiet=False): + | '''Train a model from sentences, and save it at save_loc. nr_iter + | controls the number of Perceptron training iterations.''' + | self._make_tagdict(sentences, quiet=quiet) + | self.model.classes = self.classes + | prev, prev2 = START + | for iter_ in range(nr_iter): + | c = 0; n = 0 + | for words, tags in sentences: + | context = START + [self._normalize(w) for w in words] + END + | for i, word in enumerate(words): + | guess = self.tagdict.get(word) + | if not guess: + | feats = self._get_features( + | i, word, context, prev, prev2) + | guess = self.model.predict(feats) + | self.model.update(tags[i], guess, feats) + | # Set the history features from the guesses, not the + | # true tags + | prev2 = prev; prev = guess + | c += guess == tags[i]; n += 1 + | random.shuffle(sentences) + | if not quiet: + | print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n))) + | self.model.average_weights() + | # Pickle as a binary file + | if save_loc is not None: + | cPickle.dump((self.model.weights, self.tagdict, self.classes), + | open(save_loc, 'wb'), -1) + p. + Unlike the previous snippets, this one’s literal – I tended to edit the + previous ones to simplify. So if they have bugs, hopefully that’s why! + + p. + At the time of writing, I’m just finishing up the implementation before I + submit a pull request to TextBlob. You can see the rest of the source here: + + ul + li + a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py") taggers.py + li + a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py") _perceptron.py + + h3 A final comparison… + + p. + Over the years I’ve seen a lot of cynicism about the WSJ evaluation methodology. + The claim is that we’ve just been meticulously over-fitting our methods to this + data. Actually the evidence doesn’t really bear this out. Mostly, if a technique + is clearly better on one evaluation, it improves others as well. Still, it’s + very reasonable to want to know how these tools perform on other text. So I + ran the unchanged models over two other sections from the OntoNotes corpus: + + table + thead + tr + th Tagger + th WSJ + th ABC + th Web + tbody + tr + td Pattern + td 93.5 + td 90.7 + td 88.1 + tr + td NLTK + td 94.0 + td 91.5 + td 88.4 + tr + td PyGreedyAP + td 96.8 + td 94.8 + td 91.8 + + p. + The ABC section is broadcast news, Web is text from the web (blogs etc — I haven’t + looked at the data much). + + p. 
+ As you can see, the order of the systems is stable across the three comparisons, + and the advantage of our Averaged Perceptron tagger over the other two is real + enough. Actually the pattern tagger does very poorly on out-of-domain text. + It mostly just looks up the words, so it’s very domain dependent. I hadn’t + realised it before, but it’s obvious enough now that I think about it. + + p. + We can improve our score greatly by training on some of the foreign data. + The technique described in this paper (Daume III, 2007) is the first thing + I try when I have to do that. + + + footer.meta(role='contentinfo') + a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter + .discuss + a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News + | + a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit From 0f2cb7443365505aff2468dccaec096ef262f64f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Aug 2015 08:56:30 +0200 Subject: [PATCH 071/138] * Work on docs --- docs/redesign/blog.jade | 64 ++-------- docs/redesign/blog_parser.jade | 2 +- docs/redesign/comparisons.jade | 191 ++++++++++++++++++++---------- docs/redesign/docs.jade | 2 +- docs/redesign/home.jade | 17 +-- docs/redesign/mixins.jade | 4 +- docs/redesign/online_demo.jade | 2 +- docs/redesign/tutorials.jade | 29 +++++ docs/redesign/usage_examples.jade | 111 ++++++++++++----- 9 files changed, 256 insertions(+), 166 deletions(-) diff --git a/docs/redesign/blog.jade b/docs/redesign/blog.jade index 119a5aad9..8a712267d 100644 --- a/docs/redesign/blog.jade +++ b/docs/redesign/blog.jade @@ -13,6 +13,7 @@ mixin Teaser(title, url, date_long, date_short, author, lede) a.readmore(href='#') ► + doctype html html(lang='en') head @@ -71,63 +72,22 @@ html(lang='en') "in syntactic parsing over the last few years. It’s now possible for a" + "tiny Python implementation to perform better than the widely-used Stanford " + "PCFG parser.") + +Teaser( + "A good Part-of-Speech tagger in about 200 lines of Python", + "blog_tagger.html", + "October 11, 2013", + "2013-09-11", + "Matthew Honnibal", + "There are a tonne of “best known techniques” for POS tagging, and you " + + "should ignore the others and just use greedy Averaged Perceptron." + ) - article.post - header - h2 - a(href='#') Another headline - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. - a.readmore(href='#') ► - article.post - header - h2 - a(href='#') Another headline - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. 
- a.readmore(href='#') ► - article.post - header - h2 - a(href='#') Another headline - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. - a.readmore(href='#') ► - .readmore - a.button(href='#') Read more posts section.intro h2 a.permalink(href='#tutorials', name='tutorials') Tutorials - p - | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est. + section.tutorials - details - summary - h4 Tutorial #1: How to do something cool - p - | The Natural Language Processing (NLP) community has made big progress in syntactic parsing over the last few years. It’s now possible for a tiny Python implementation to perform better than the widely-used Stanford PCFG parser. - a.readmore(href='#') ► - details - summary - h4 Tutorial #2 - details - summary - h4 Tutorial #3 + include ./tutorials.jade footer(role="contentinfo") span.slogan.copyright © 2015 Syllogism Co. diff --git a/docs/redesign/blog_parser.jade b/docs/redesign/blog_parser.jade index 4930d8d26..e94376e32 100644 --- a/docs/redesign/blog_parser.jade +++ b/docs/redesign/blog_parser.jade @@ -15,7 +15,7 @@ block body_block article.post header - h2 Parsing English with 500 lines of Python + h2 Parsing English in 500 lines of Python .subhead | by a(href='#', rel='author') Matthew Honnibal diff --git a/docs/redesign/comparisons.jade b/docs/redesign/comparisons.jade index a80df8235..c4434db5c 100644 --- a/docs/redesign/comparisons.jade +++ b/docs/redesign/comparisons.jade @@ -1,78 +1,139 @@ +- var urls = {} +- urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf" +- urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf" + + +comparison("NLTK") + p spaCy is: + ul + li.pro 100x faster; + li.pro 50% more accurate; + li.pro Serializes TODO% smaller; + + p spaCy features: + ul + li.pro Integrated word vectors; + li.pro Efficient binary serialization; + + p NLTK features: + ul + li.con Multiple languages; + li.neutral Educational resources + + //+comparison("Pattern") +comparison("CoreNLP") + p spaCy is: + + ul + li.pro TODO% faster; + li.pro TODO% more accurate; + li.pro Not Java; + li.pro Well documented; + li.pro Cheaper to license commercially; + li.neutral + | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping + | options. + + p CoreNLP features: + + ul + li.con Multiple Languages; + li.con Sentiment analysis + li.con Coreference resolution + + +comparison("ClearNLP") -//+comparison("OpenNLP") -//+comparison("GATE") + p spaCy is: -+comparison("Accuracy Summary") + ul + li.pro Not Java; + li.pro TODO% faster; + li.pro Well documented; + li.neutral Slightly more accurate; + + p ClearNLP features: + + ul + li.con Semantic Role Labelling + li.con Multiple Languages + li.con Model for biology/life-science; + +//+comparison("Accuracy Summary") + +//+comparison("Speed Summary") +// table +// thead +// tr +// th. 
+// th(colspan=3) Absolute (ms per doc) +// th(colspan=3) Relative (to spaCy) +// +// tbody +// tr +// td: strong System +// td: strong Split +// td: strong Tag +// td: strong Parse +// td: strong Split +// td: strong Tag +// td: strong Parse +// +// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") +// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") +// +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") +// +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") +// +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") +// +// p +// | Set up: 100,000 plain-text documents were streamed +// | from an SQLite3 database, and processed with an NLP library, to one +// | of three levels of detail – tokenization, tagging, or parsing. +// | The tasks are additive: to parse the text you have to tokenize and +// | tag it. The pre-processing was not subtracted from the times – +// | I report the time required for the pipeline to complete. I report +// | mean times per document, in milliseconds. +// +// p +// | Hardware: Intel i7-3770 (2012) + + + + + ++comparison("Peer-reviewed Evaluations") + p. + spaCy is committed to rigorous evaluation under standard methodology. Two + papers in 2015 confirm that: + ol + li spaCy is the fastest syntactic parser in the world; + li Its accuracy is within 1% of the best available; + li The few systems that are more accurate are 20× slower or more. + + p + | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University, + | as part of a survey paper benchmarking the current state-of-the-art dependency + | parsers + a(href=urls.choi_paper) (Choi et al., 2015) + | . -+comparison("Speed Summary") table thead - tr - th. - th(colspan=3) Absolute (ms per doc) - th(colspan=3) Relative (to spaCy) + +columns("System", "Language", "Accuracy", "Speed") tbody - tr - td: strong System - td: strong Split - td: strong Tag - td: strong Parse - td: strong Split - td: strong Tag - td: strong Parse - - +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") - +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") - +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") - +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") - +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") + +row("spaCy v0.84", "Cython", "90.6", "13,963") + +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)") + +row("ClearNLP", "Java", "91.7", "10,271") + +row("CoreNLP", "Java", "89.6", "8,602") + +row("MATE", "Java", "92.5", "550") + +row("Turbo", "C++", "92.4", "349") + +row("Yara", "Java", "92.3", "340") p - | Set up: 100,000 plain-text documents were streamed - | from an SQLite3 database, and processed with an NLP library, to one - | of three levels of detail – tokenization, tagging, or parsing. - | The tasks are additive: to parse the text you have to tokenize and - | tag it. The pre-processing was not subtracted from the times – - | I report the time required for the pipeline to complete. I report - | mean times per document, in milliseconds. + | Discussion with the authors led to accuracy improvements in spaCy, which + | have been accepted for publication in EMNLP, in joint work with Macquarie + | University + a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015) + | . - p - | Hardware: Intel i7-3770 (2012) - - - +comparison("Independent Evaluation") - p - | Independent evaluation by Yahoo! Labs and Emory - | University, to appear at ACL 2015. Higher is better. 
- - table - thead - +columns("System", "Language", "Accuracy", "Speed") - - tbody - +row("spaCy v0.86", "Cython", "91.9", "13,963") - +row("spaCy v0.84", "Cython", "90.6", "13,963") - +row("ClearNLP", "Java", "91.7", "10,271") - +row("CoreNLP", "Java", "89.6", "8,602") - +row("MATE", "Java", "92.5", "550") - +row("Turbo", "C++", "92.4", "349") - +row("Yara", "Java", "92.3", "340") - - p - | Accuracy is % unlabelled arcs correct, speed is tokens per second. - - p - | Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory) - | performed a detailed comparison of the best parsers available. - | All numbers above are taken from the pre-print they kindly made - | available to me, except for spaCy v0.86. - - p - | I'm particularly grateful to the authors for discussion of their - | results, which led to the improvement in accuracy between v0.84 and - | v0.86. A tip from Jin-ho developer of ClearNLP) was particularly - | useful. diff --git a/docs/redesign/docs.jade b/docs/redesign/docs.jade index 2b5c88760..e098bb0c0 100644 --- a/docs/redesign/docs.jade +++ b/docs/redesign/docs.jade @@ -125,5 +125,5 @@ block body_block article +Section("API", "api", "api.jade") - +Section("Tutorals", "tutorials", "tutorials.jade") + +Section("Tutorials", "tutorials", "tutorials.jade") +Section("Annotation Specifications", "spec", "spec.jade") diff --git a/docs/redesign/home.jade b/docs/redesign/home.jade index a628da2db..66efd1455 100644 --- a/docs/redesign/home.jade +++ b/docs/redesign/home.jade @@ -28,17 +28,6 @@ mixin lede If you're a small company doing NLP, we want spaCy to seem like !{a_minor_miracle}. -mixin overview() - p. - Overview text - -mixin benchmarks() - p. - Benchmarks - -mixin get_started() - p. - Get Started mixin comparison(name) details @@ -78,20 +67,22 @@ block intro_block nav(role="navigation") ul li: a(href="#example-use" class="button") Examples - li: a(href="#online-demo" class="button") Demo li: a(href="#comparisons" class="button") Comparisons + li: a(href="#online-demo" class="button") Try Online li: a(href="#install" class="button") | Install v0.89 + block body_block article(class="page landing-page") +Section("Usage by Example", "example-use", "./usage_examples.jade") + +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") + +Section("Online Demo", "online-demo", "./online_demo.jade") - +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") +Section("Install", "install", "./install.jade") diff --git a/docs/redesign/mixins.jade b/docs/redesign/mixins.jade index 34ad293aa..005149a2b 100644 --- a/docs/redesign/mixins.jade +++ b/docs/redesign/mixins.jade @@ -1,5 +1,5 @@ mixin Section(title_text, link_name, include_file) - h3: a(name=link_name href=link_name) #{title_text} + h3: a(name=link_name) #{title_text} if (link_name == "example-use") include ./usage_examples.jade @@ -15,5 +15,3 @@ mixin Section(title_text, link_name, include_file) include ./tutorials.jade else if (link_name == "spec") include ./spec.jade - - diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade index 0e2bbb331..92a61eefc 100644 --- a/docs/redesign/online_demo.jade +++ b/docs/redesign/online_demo.jade @@ -5,7 +5,7 @@ mixin Displacy(sentence, caption_text, height) iframe.displacy(src="displacy/displacy_demo.html" height=height) a.view-displacy(href=url) - | View on displaCy + | Interactive Visualizer p.caption. 
#{caption_text} diff --git a/docs/redesign/tutorials.jade b/docs/redesign/tutorials.jade index e69de29bb..ad1a4dbc9 100644 --- a/docs/redesign/tutorials.jade +++ b/docs/redesign/tutorials.jade @@ -0,0 +1,29 @@ +mixin Tutorial(title) + details + summary + h4= title + + block + ++Tutorial("Mark-up all manner adverbs, especially for verbs of speech") + | Let's say you're developing a proofreading tool, or possibly an IDE for + | writers. You're convinced by Stephen King's advice that + | adverbs are not your friend + | so you want to + a.readmore(href='tute_adverbs.html') + | highlight all adverbs. ► + ++Tutorial("Search Reddit for comments about Google doing something") + | Example use of the spaCy NLP tools for data exploration. + | Here we will look for Reddit comments that describe Google doing something, + | i.e. discuss the company's actions. This is difficult, because other + | senses of "Google" now dominate usage of the word in conversation, + | particularly references to using Google products. + a.readmore(href='tute_adverbs.html') + | ► + ++Tutorial("Use word vectors for semantic search of Twitter") + | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. + | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. + a.readmore(href='tute_twitter.html') + | ► diff --git a/docs/redesign/usage_examples.jade b/docs/redesign/usage_examples.jade index 418ca9c57..04f29eeb9 100644 --- a/docs/redesign/usage_examples.jade +++ b/docs/redesign/usage_examples.jade @@ -23,7 +23,7 @@ mixin example(name) | hello_id = nlp.vocab.strings['Hello'] | hello_str = nlp.vocab.strings[hello_id] | - | assert token.orth == hello_id == 52 + | assert token.orth == hello_id == 52 | assert token.orth_ == hello_str == 'Hello' +example("Get and set string views and flags") @@ -66,51 +66,102 @@ mixin example(name) +example("Part-of-speech tags") pre.language-python: code - | doc[0].pos - | doc[0].tag + | from spacy.parts_of_speech import ADV + | + | def is_adverb(token): + | return token.pos == spacy.parts_of_speech.ADV + | + | # These are data-specific, so no constants are provided. You have to look + | # up the IDs from the StringStore. 
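 +    | # ('NNS' and 'NNPS' are the Penn Treebank tags for plural common and proper nouns.)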
+ | NNS = nlp.vocab.strings['NNS'] + | NNPS = nlp.vocab.strings['NNPS'] + | def is_plural_noun(token): + | return token.tag == NNS or token.tag == NNPS + | + | def print_coarse_pos(token): + | print(token.pos_) + | + | def print_fine_pos(token): + | print(token.tag_) +example("Syntactic dependencies") pre.language-python: code - | for head in tokens: - | for child in head.lefts: - | assert child.head is head - | for child in head.rights: - | assert child.head is head - | sent = nlp('The four wheels on the bus turned quickly.') - | wheels = sent[2] - | bus = sent[5] - | assert len(list(wheels.lefts)) == 2 - | assert len(list(wheels.rights)) == 1 - | assert len(list(wheels.children)) == 3 - | assert len(list(bus.lefts)) == 1 - | assert len(list(bus.rights)) == 0 - | assert len(list(bus.children)) == 1 - | - | assert len(list(wheels.subtree)) == 6 + | def dependency_labels_to_root(token): + | '''Walk up the syntactic tree, collecting the arc labels.''' + | dep_labels = [] + | while token.root is not token: + | dep_labels.append(token.dep) + | token = token.head + | return dep_labels +example("Named entities") pre.language-python: code - | doc.ents - | token.ent_type - | token.ent_iob + | def iter_products(docs): + | for doc in docs: + | for ent in doc.ents: + | if ent.label_ == 'PRODUCT': + | yield ent + | + | def word_is_in_entity(word): + | return word.ent_type != 0 + | + | def count_parent_verb_by_person(docs): + | counts = defaultdict(defaultdict(int)) + | for doc in docs: + | for ent in doc.ents: + | if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: + | counts[ent.orth_][ent.root.head.lemma_] += 1 + | return counts + + //+example("Define custom NER rules") + // pre.language-python: code + // | nlp.matcher -+example("Define custom NER rules") - pre.language-python: code - | nlp.matcher +example("Calculate inline mark-up on original string") pre.language-python: code - | token.string - | token.spacy - | token.whitespace_ + | def put_spans_around_tokens(doc, get_classes): + | '''Given some function to compute class names, put each token in a + | span element, with the appropriate classes computed. + | + | All whitespace is preserved, outside of the spans. (Yes, I know HTML + | won't display it. But the point is no information is lost, so you can + | calculate what you need, e.g.
<br /> tags, <p>
tags, etc.) + | ''' + | output = [] + | template = '<span classes="{classes}">{word}</span>{space}' + | for token in doc: + | if token.is_space: + | output.append(token.orth_) + | else: + | output.append( + | template.format( + | classes=' '.join(get_classes(token)), + | word=token.orth_, + | space=token.whitespace_)) + | string = ''.join(output) + | string = string.replace('\n', '
') + | string = string.replace('\t', '    ' + | return string + +example("Efficient binary serialization") pre.language-python: code - + | | byte_string = doc.as_bytes() | open('/tmp/moby_dick.bin', 'wb').write(byte_string) - + | | nlp = spacy.en.English() | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')): | doc = Doc(nlp.vocab) | doc.from_bytes(byte_string) + + +p + | See the + a(href="docs.html") docs page + | for + a(href="docs.html#api") API documentation, + a(href="docs.html#tutorials") tutorials, + | and + a(href="docs.html#spec") annotation specs. From cad0cca4e3b7c50f45e1e1084d7d3d2fbc6db7ae Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:04:34 +0200 Subject: [PATCH 072/138] * Tmp --- spacy/en/__init__.py | 1 - spacy/lexeme.pxd | 126 ++++++++++++++----------------------------- spacy/lexeme.pyx | 109 ++++++++++++++++++++++++------------- spacy/matcher.pyx | 34 ++++++------ spacy/strings.pyx | 5 ++ spacy/vocab.pyx | 45 +++++----------- 6 files changed, 147 insertions(+), 173 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index c81630a72..a04b615da 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -80,7 +80,6 @@ class English(object): Packer=None, load_vectors=True ): - self.data_dir = data_dir if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f7b210281..321f7c616 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -8,97 +8,53 @@ from .strings cimport StringStore from numpy cimport ndarray - cdef LexemeC EMPTY_LEXEME - -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore strings, - const float* empty_vec) except -1 - cdef class Lexeme: - cdef readonly ndarray repvec - - cdef readonly flags_t flags - cdef readonly attr_t id - cdef readonly attr_t length - + cdef LexemeC* c + cdef readonly Vocab vocab cdef readonly attr_t orth - cdef readonly attr_t lower - cdef readonly attr_t norm - cdef readonly attr_t shape - cdef readonly attr_t prefix - cdef readonly attr_t suffix - cdef readonly unicode orth_ - cdef readonly unicode lower_ - cdef readonly unicode norm_ - cdef readonly unicode shape_ - cdef readonly unicode prefix_ - cdef readonly unicode suffix_ + cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: + lex.length = props['length'] + lex.orth = vocab.strings[props['orth']] + lex.lower = vocab.strings[props['lower']] + lex.norm = vocab.strings[props['norm']] + lex.shape = vocab.strings[props['shape']] + lex.prefix = vocab.strings[props['prefix']] + lex.suffix = vocab.strings[props['suffix']] - cdef readonly attr_t cluster - cdef readonly float prob - cdef readonly float sentiment - cdef readonly float l2_norm + lex.cluster = props['cluster'] + lex.prob = props['prob'] + lex.sentiment = props['sentiment'] + + lex.flags = props['flags'] + lex.repvec = empty_vec - # Workaround for an apparent bug in the way the decorator is handled --- - # TODO: post bug report / patch to Cython. 
@staticmethod - cdef inline Lexeme from_ptr(const LexemeC* ptr, StringStore strings, int repvec_length): - cdef Lexeme py = Lexeme.__new__(Lexeme, repvec_length) - for i in range(repvec_length): - py.repvec[i] = ptr.repvec[i] - py.l2_norm = ptr.l2_norm - py.flags = ptr.flags - py.id = ptr.id - py.length = ptr.length + cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: + if feat_name < (sizeof(flags_t) * 8): + return Lexeme.check_flag(lex, feat_name) + elif feat_name == ID: + return lex.id + elif feat_name == ORTH: + return lex.orth + elif feat_name == LOWER: + return lex.lower + elif feat_name == NORM: + return lex.norm + elif feat_name == SHAPE: + return lex.shape + elif feat_name == PREFIX: + return lex.prefix + elif feat_name == SUFFIX: + return lex.suffix + elif feat_name == LENGTH: + return lex.length + elif feat_name == CLUSTER: + return lex.cluster + else: + return 0 - py.orth = ptr.orth - py.lower = ptr.lower - py.norm = ptr.norm - py.shape = ptr.shape - py.prefix = ptr.prefix - py.suffix = ptr.suffix - - py.orth_ = strings[ptr.orth] - py.lower_ = strings[ptr.lower] - py.norm_ = strings[ptr.norm] - py.shape_ = strings[ptr.shape] - py.prefix_ = strings[ptr.prefix] - py.suffix_ = strings[ptr.suffix] - - py.cluster = ptr.cluster - py.prob = ptr.prob - py.sentiment = ptr.sentiment - return py - - cpdef bint check_flag(self, attr_id_t flag_id) except -1 - - -cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: - return lexeme.flags & (1 << flag_id) - - -cdef inline attr_t get_attr(const LexemeC* lex, attr_id_t feat_name) nogil: - if feat_name < (sizeof(flags_t) * 8): - return check_flag(lex, feat_name) - elif feat_name == ID: - return lex.id - elif feat_name == ORTH: - return lex.orth - elif feat_name == LOWER: - return lex.lower - elif feat_name == NORM: - return lex.norm - elif feat_name == SHAPE: - return lex.shape - elif feat_name == PREFIX: - return lex.prefix - elif feat_name == SUFFIX: - return lex.suffix - elif feat_name == LENGTH: - return lex.length - elif feat_name == CLUSTER: - return lex.cluster - else: - return 0 + cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: + return lexeme.flags & (1 << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 07f151114..f0b3303f1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -17,70 +17,105 @@ from .attrs cimport IS_OOV memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) -cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store, - const float* empty_vec) except -1: - lex.length = props['length'] - lex.orth = string_store[props['orth']] - lex.lower = string_store[props['lower']] - lex.norm = string_store[props['norm']] - lex.shape = string_store[props['shape']] - lex.prefix = string_store[props['prefix']] - lex.suffix = string_store[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] - lex.repvec = empty_vec - - cdef class Lexeme: """An entry in the vocabulary. A Lexeme has no string context --- it's a word-type, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). 
""" - def __cinit__(self, int vec_size): - self.repvec = numpy.ndarray(shape=(vec_size,), dtype=numpy.float32) + def __init__(self, Vocab vocab, int orth): + self.vocab = vocab + self.orth = orth + self.c = vocab.get_by_orth(orth) - @property - def has_repvec(self): - return self.l2_norm != 0 + property orth: + def __get__(self): + return self.c.orth + + property lower: + def __get__(self): return self.c.lower + def __set__(self, int x): self.c.lower = x + + property norm: + def __get__(self): return self.c.norm + def __set__(self, int x): self.c.norm = x - cpdef bint check_flag(self, attr_id_t flag_id) except -1: - cdef flags_t one = 1 - return self.flags & (one << flag_id) + property shape: + def __get__(self): return self.c.shape + def __set__(self, int x): self.c.shape = x + + property prefix: + def __get__(self): return self.c.prefix + def __set__(self, int x): self.c.prefix = x + + property suffix: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property orth_: + def __get__(self): + return self.vocab.strings[self.c.orth] + + property lower_: + def __get__(self): return self.vocab.strings[self.c.lower] + def __set__(self, unicode x): self.c.lower = self.vocab.strings[x] + + property norm_: + def __get__(self): return self.c.norm + def __set__(self, unicode x): self.c.norm = self.vocab.strings[x] + + property shape_: + def __get__(self): return self.vocab.strings[self.c.shape] + def __set__(self, unicode x): self.c.shape = self.vocab.strings[x] + + property prefix_: + def __get__(self): return self.c.prefix + def __set__(self, unicode x): self.c.prefix = self.vocab.strings[x] + + property suffix_: + def __get__(self): return self.c.suffix + def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] property is_oov: - def __get__(self): return self.check_flag(IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) property is_alpha: - def __get__(self): return self.check_flag(IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) property is_ascii: - def __get__(self): return self.check_flag(IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) property is_digit: - def __get__(self): return self.check_flag(IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) property is_lower: - def __get__(self): return self.check_flag(IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) property is_title: - def __get__(self): return self.check_flag(IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) property is_punct: - def __get__(self): return self.check_flag(IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) property is_space: - def __get__(self): return self.check_flag(IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) property like_url: - def __get__(self): return self.check_flag(LIKE_URL) + def __get__(self): return 
Lexeme.check_flag(self.c, LIKE_URL) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return self.check_flag(LIKE_NUM) + def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: - def __get__(self): return self.check_flag(LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL) + def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index ee2ceaecc..72473b073 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -12,6 +12,8 @@ from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab +from libcpp.vector cimport vector + try: import ujson as json except ImportError: @@ -96,28 +98,26 @@ def map_attr_name(attr): cdef class Matcher: cdef Pool mem - cdef Pattern** patterns + cdef vector[Pattern*] patterns cdef readonly int n_patterns def __init__(self, vocab, patterns): self.mem = Pool() - n_patterns = sum([len(specs) for etype, attrs, specs in patterns.values()]) - self.patterns = self.mem.alloc(n_patterns, sizeof(Pattern*)) - cdef int i = 0 for entity_key, (etype, attrs, specs) in sorted(patterns.items()): - if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] - if isinstance(etype, basestring): - etype = vocab.strings[etype] - elif etype is None: - etype = -1 - # TODO: Do something more clever about multiple patterns for single - # entity - for spec in specs: - spec = _convert_strings(spec, vocab.strings) - self.patterns[i] = init_pattern(self.mem, spec, etype) - i += 1 - self.n_patterns = len(patterns) + self.add(entity_key, etype, attrs, specs) + + def add(self, entity_key, etype, attrs, specs): + if isinstance(entity_key, basestring): + entity_key = vocab.strings[entity_key] + if isinstance(etype, basestring): + etype = vocab.strings[etype] + elif etype is None: + etype = -1 + # TODO: Do something more clever about multiple patterns for single + # entity + for spec in specs: + spec = _convert_strings(spec, vocab.strings) + self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod def from_dir(cls, vocab, data_dir): diff --git a/spacy/strings.pyx b/spacy/strings.pyx index b35ed2ccb..c187a6aa6 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -108,6 +108,11 @@ cdef class StringStore: else: raise TypeError(type(string_or_id)) + def __iter__(self): + cdef int i + for i in range(self.size): + yield self[i] + cdef const Utf8Str* intern(self, unsigned char* chars, int length) except NULL: # 0 means missing, but we don't bother offsetting the index. key = hash64(chars, length * sizeof(char), 0) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ac2e11e11..dcb7d575c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -36,24 +36,20 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. 
''' - def __init__(self, data_dir=None, get_lex_props=None, load_vectors=True, - pos_tags=None, oov_prob=-30): - if oov_prob is None: - oov_prob = -30 + def __init__(self, data_dir=None, get_lex_attr=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() self.pos_tags = pos_tags if pos_tags is not None else {} - - self.lexeme_props_getter = get_lex_props + + self.get_lex_attr = get_lex_attr self.repvec_length = 0 self.length = 0 self._add_lex_to_vocab(0, &EMPTY_LEXEME) if data_dir is not None: if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if data_dir is not None: if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) self.load_lexemes(path.join(data_dir, 'strings.txt'), @@ -63,7 +59,6 @@ cdef class Vocab: self._serializer = None self.data_dir = data_dir - self.oov_prob = oov_prob property serializer: def __get__(self): @@ -91,18 +86,8 @@ cdef class Vocab: lex = self._by_hash.get(key) if lex != NULL: return lex - cdef bint is_oov = mem is not self.mem - if len(string) < 3: - mem = self.mem - lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - if is_oov: - lex.id = 0 else: - self._add_lex_to_vocab(key, lex) - assert lex != NULL, string - return lex + return self._new_lexeme(mem, string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme @@ -114,18 +99,21 @@ cdef class Vocab: lex = self._by_orth.get(orth) if lex != NULL: return lex - cdef unicode string = self.strings[orth] + else: + return self._new_lexeme(mem, self.strings[orth]) + + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: cdef bint is_oov = mem is not self.mem if len(string) < 3: mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - props = self.lexeme_props_getter(string, self.oov_prob, is_oov=is_oov) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) + for attr, func in self.lex_attr_getters.items(): + Lexeme.set_struct_attr(lex, attr, func(string)) if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(hash_string(string), lex) - assert lex != NULL, orth + self._add_lex_to_vocab(key, lex) + assert lex != NULL, string return lex cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: @@ -171,15 +159,6 @@ cdef class Vocab: "int --> Lexeme" % str(type(id_or_string))) return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) - def __setitem__(self, unicode string, dict props): - cdef hash_t key = hash_string(string) - cdef LexemeC* lex - lex = self._by_hash.get(key) - if lex == NULL: - lex = self.mem.alloc(sizeof(LexemeC), 1) - set_lex_struct_props(lex, props, self.strings, EMPTY_VEC) - self._add_lex_to_vocab(key, lex) - def dump(self, loc): if path.exists(loc): assert not path.isdir(loc) From 890d6aa21686be72559380478cf85aaed0c502ac Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:06:30 +0200 Subject: [PATCH 073/138] * Remove old docs --- docs/redesign/api.jade | 661 ------------------ docs/redesign/blog.jade | 95 --- docs/redesign/blog_intro.jade | 81 --- docs/redesign/blog_parser.jade | 938 -------------------------- docs/redesign/blog_tagger.jade | 492 -------------- docs/redesign/change_log.jade | 0 docs/redesign/comparisons.jade | 139 ---- docs/redesign/docs.jade | 129 
---- docs/redesign/home.jade | 88 --- docs/redesign/installation.jade | 71 -- docs/redesign/license.jade | 179 ----- docs/redesign/mixins.jade | 17 - docs/redesign/online_demo.jade | 18 - docs/redesign/outline.jade | 37 - docs/redesign/spec.jade | 129 ---- docs/redesign/template_post.jade | 31 - docs/redesign/tute_adverbs.jade | 200 ------ docs/redesign/tute_syntax_search.jade | 132 ---- docs/redesign/tute_twitter.jade | 204 ------ docs/redesign/tutorials.jade | 29 - docs/redesign/usage_examples.jade | 167 ----- 21 files changed, 3837 deletions(-) delete mode 100644 docs/redesign/api.jade delete mode 100644 docs/redesign/blog.jade delete mode 100644 docs/redesign/blog_intro.jade delete mode 100644 docs/redesign/blog_parser.jade delete mode 100644 docs/redesign/blog_tagger.jade delete mode 100644 docs/redesign/change_log.jade delete mode 100644 docs/redesign/comparisons.jade delete mode 100644 docs/redesign/docs.jade delete mode 100644 docs/redesign/home.jade delete mode 100644 docs/redesign/installation.jade delete mode 100644 docs/redesign/license.jade delete mode 100644 docs/redesign/mixins.jade delete mode 100644 docs/redesign/online_demo.jade delete mode 100644 docs/redesign/outline.jade delete mode 100644 docs/redesign/spec.jade delete mode 100644 docs/redesign/template_post.jade delete mode 100644 docs/redesign/tute_adverbs.jade delete mode 100644 docs/redesign/tute_syntax_search.jade delete mode 100644 docs/redesign/tute_twitter.jade delete mode 100644 docs/redesign/tutorials.jade delete mode 100644 docs/redesign/usage_examples.jade diff --git a/docs/redesign/api.jade b/docs/redesign/api.jade deleted file mode 100644 index 0bc956ce1..000000000 --- a/docs/redesign/api.jade +++ /dev/null @@ -1,661 +0,0 @@ -mixin declare_class(name) - details - summary - span.declaration - span.label class - code #{name} - block - -mixin method(name, parameters) - details(open=attributes.open) - summary - span.declaration - span.label #{name} - span.parameters - | self, #{parameters} - block - - -mixin params - ul - block - - -mixin param(name, type, value) - li - if type - #{name} (!{type}) – - else - #{name} – - block - - -mixin attribute(name, type, value) - details(open=attributes.open) - summary - span.declaration - span.label #{name} - block - - -mixin returns(name, type, value) - li - if type - #{name} (!{type}) – - else - #{name} – - block - - -mixin returns(type) - | tmp - -mixin init - details - summary: h4 Init - - block - - -mixin callable - details - summary: h4 Callable - - block - - -mixin sequence - details - summary: h4 Sequence - - block - - -mixin maptype - details - summary: h4 Map - - block - - -mixin summary - block - -mixin en_example - pre.language-python - code - | from spacy.en import English - | from spacy._doc_examples import download_war_and_peace - | - | unprocessed_unicode = download_war_and_peace() - | - | nlp = English() - | doc = nlp(unprocessed_unicode) - - -+declare_class("English") - p Load models into a callable object to process English text. - - +summary - +en_example - - +init - p - | Load the resources. Loading takes 20 seconds, and the instance - | consumes 2 to 3 gigabytes of memory. - - p - | Intended use is for one instance to be created per process. - | You can create more if you're doing something unusual. - p - | You may wish to make the instance a global variable or "singleton". - | We usually instantiate the object in the main() - | function and pass it around as an explicit argument. 
- +method("__init__", "data_dir=True, Tagger=True, Parser=True, Entity=True, Matcher=True, Packer=None, load_vectors=True")(open="true") - - +params - +param("data_dir") - | The data directory. May be #{None}, to disable any data loading - | (including the vocabulary). - - +param("Tokenizer") - | A class/function that creates the tokenizer. - - +param("Tagger") - | A class/function that creates the part-of-speech tagger. - - +param("Parser") - | A class/function that creates the dependency parser. - - +param("Entity") - | A class/function that creates the named entity recogniser. - - +param("load_vectors") - | A boolean value to control whether the word vectors are loaded. - - +callable - +method("__call__", "text, tag=True, parse=True, entity=True") - - +params - +param("text", types.unicode) - | The text to be processed. No pre-processing needs to be applied, - | and any length of text can be submitted. Usually you will submit - | a whole document. Text may be zero-length. An exception is raised - | if byte strings are supplied. - - +param("tag", types.bool) - | Whether to apply the part-of-speech tagger. Required for parsing - | and entity recognition. - - +param("parse", types.bool) - | Whether to apply the syntactic dependency parser. - - +param("entity", types.bool) - | Whether to apply the named entity recognizer. - - pre.language-python - code - | from spacy.en import English - | nlp = English() - | doc = nlp(u'Some text.) # Applies tagger, parser, entity - | doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser - | doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity - | doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser - | doc = nlp(u'') # Zero-length tokens, not an error - | # doc = nlp(b'Some text') <-- Error: need unicode - | doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. - - -+declare_class("Doc") - p I'm a doc - - +init - +method("__init__", "vocab") - +params - +param("vocab", vocab_type) - | A vocabulary object - - +sequence - +method("__getitem__", "i", types.int) - +returns(types.Token) - - +method("__getitem__", "start_end", types.slice) - +returns(types.Span) - - +method("__iter__") - | Iterate over tokens - - +method("__len__") - | Number of tokens in the document. - - details - summary: h4 Spans - - +attribute("sents", types.generator) - | Iterate over sentences in the document. - - +attribute("ents", types.generator) - | Iterate over named entities in the document. - - +attribute("noun_chunks", types.generator) - - details - summary: h4 Export/Import - - +method("to_array", "attr_ids") - - | Given a list of M attribute IDs, export the tokens to a numpy ndarray - | of shape N*M, where N is the length of the sentence. - - +params - +param("attr_ids", "list[int]") - | A list of attribute ID ints. - - +returns("feat_array") - | A feature matrix, with one row per word, and one column per attribute - | indicated in the input attr_ids. - - +method("count_by", "attr_id") - | Produce a dict of {attribute (int): count (ints)} frequencies, keyed - | by the values of the given attribute ID. 
- - pre.language-python - code - | >>> from spacy.en import English, attrs - | >>> nlp = English() - | >>> tokens = nlp(u'apple apple orange banana') - | >>> tokens.count_by(attrs.ORTH) - | {12800L: 1, 11880L: 2, 7561L: 1} - | >>> tokens.to_array([attrs.ORTH]) - | array([[11880], - | [11880], - | [7561], - | [12800]]) - - +method("from_array", "attrs, array") - | Load from array - - +method("from_bytes") - | Deserialize, loading from bytes - - +method("read_bytes") - | classmethod - - //+method("merge", "int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type") - - // | Merge a multi-word expression into a single token. Currently - // | experimental; API is likely to change. - - -+declare_class("Token") - +init - +method("__init__", "vocab, doc, offset") - +params - +param("vocab", types.Vocab) - p A Vocab object - - +param("doc", types.Doc) - p The parent sequence - - +param("offset", types.int) - p The index of the token within the document - - details - summary: h4 String Views - - +attribute("orth / orth_") - | The form of the word with no string normalization or processing, as - | it appears in the string, without trailing whitespace. - - +attribute("lemma / lemma_") - | The "base" of the word, with no inflectional suffixes, e.g. the lemma of - | "developing" is "develop", the lemma of "geese" is "goose", etc. Note that - | derivational suffixes are not stripped, e.g. the lemma of - | "instutitions" is "institution", not "institute". Lemmatization is - | performed using the WordNet data, but extended to also cover closed-class - | words such as pronouns. By default, the WN lemmatizer returns "hi" - | as the lemma of "his". We assign pronouns the lemma -PRON-. - - +attribute("lower / lower_") - | The form of the word, but forced to lower-case, i.e. - pre.language-python: code lower = word.orth\_.lower() - - //+attribute("norm / norm_") - // | The form of the word, after language-specific normalizations has been - // | applied. - - +attribute("shape / shape_") - | A transform of the word's string, to show orthographic features. - | The characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped - | to d. After these mappings, sequences of 4 or more of the same character - | are truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, - | :) --> :) - - +attribute("prefix / prefix_") - | A length-N substring from the start of the word. Length may vary by - | language; currently for English n=1, i.e. - pre.language-python: code prefix = word.orth\_[:1] - - +attribute("suffix / suffix_") - | A length-N substring from the end of the word. Length may vary by - | language; currently for English n=3, i.e. - pre.language-python: code suffix = word.orth\_[-3:] - - //+attribute("lex_id") - // | lex_id - - details - summary: h4 Alignment and Output - - +attribute("idx") - p Start index of the token in the string - - +method("__len__", "") - p Length of the token's orth string, in unicode code-points. - - +method("__unicode__", "") - p Same as token.orth_ - - +method("__str__", "") - p Varies between Python 2 and Python 3 - - +attribute("string") - p - | The form of the word as it appears in the string, including - | trailing whitespace. This is useful when you need to use - | linguistic features to add inline mark-up to the string. 
- - +method("nbor, i=1") - +params - +param("i") - p Offset relative to token - - details - summary: h4 Distributional Features - - +attribute("repvec") - p - | A "word embedding" representation: a dense real-valued vector that supports - | similarity queries between words. By default, spaCy currently loads - | vectors produced by the Levy and Goldberg (2014) dependency-based word2vec - | model. - - +attribute("cluster") - p - | The Brown cluster ID of the word. These are often useful features for - | linear models. If you're using a non-linear model, particularly a - | neural net or random forest, consider using the real-valued word - | representation vector, in Token.repvec, instead. - - +attribute("prob") - p - | The unigram log-probability of the word, estimated from counts from a - | large corpus, smoothed using Simple Good Turing estimation. - - details - summary: h4 Syntactic Tags - - +attribute("pos / pos_") - p - | A part-of-speech tag, from the Google Universal Tag Set, e.g. - | code>NOUN, VERB, ADV. Constants for - | the 17 tag values are provided in spacy.parts_of_speech. - - +attribute("tag / tag_") - p - | A morphosyntactic tag, e.g. NN, VBZ, - | DT, etc. These tags are language/corpus specific, and - | typically describe part-of-speech and some amount of morphological - | information. For instance, in the Penn Treebank tag set, VBZ - | is assigned to a present-tense singular verb. - - +attribute("dep / dep_") - p - | The type of syntactic dependency relation between the word and its - | syntactic head. - - details - summary: h4 Navigating the Parse Tree - - +attribute("head") - p - | The Token that is the immediate syntactic head of the word. If the - | word is the root of the dependency tree, the same word is returned. - - +attribute("lefts") - p - | An iterator for the immediate leftward syntactic children of the - | word. - - +attribute("rights") - p - | An iterator for the immediate rightward syntactic children of the - | word. - - +attribute("n_lefts") - p - | The number of immediate syntactic children preceding the word in - | the string. - - +attribute("n_rights") - p - | The number of immediate syntactic children following the word in - | the string. - - +attribute("children") - p - | An iterator that yields from lefts, and then yields from rights. - - +attribute("subtree") - p - | An iterator for the part of the sentence syntactically governed by - | the word, including the word itself. - - +attribute("left_edge") - p The leftmost edge of the token's subtree - - +attribute("right_edge") - p The rightmost edge of the token's subtree - - details - summary: h4 Named Entities - - +attribute("ent_type") - p If the token is part of an entity, its entity type. - - +attribute("ent_iob") - p The IOB (inside, outside, begin) entity recognition tag for the token. 
- - details - summary: h4 Lexeme Flags - - +method("check_flag", "flag_id") - +params - +param("flag_id") - | flag ID - - +attribute("is_oov") - +attribute("is_alpha") - +attribute("is_ascii") - +attribute("is_digit") - +attribute("is_lower") - +attribute("is_title") - +attribute("is_punct") - +attribute("is_space") - +attribute("like_url") - +attribute("like_num") - +attribute("like_email") - - //+attribute("conjuncts") - // | Conjuncts - -+declare_class("Span") - +init - +method("__init__") - Temp - - span = doc[0:4] - - +sequence - +method("__getitem__") - p Get item - - +method("__iter__") - p Iter - - +method("__len__") - p Len - - details - summary: h4 Parse - - +attribute("root") - p Syntactic head - - +attribute("lefts") - p Tokens that are: - ol - li To the left of the span; - li Syntactic children of words within the span - - p i.e. - - pre.language-python - code - | lefts = [span.doc[i] for i in range(0, span.start) - | if span.doc[i].head in span] - - +attribute("rights") - p Tokens that are: - ol - li To the right of the span; - li Syntactic children of words within the span - p i.e. - pre.language-python - code - | rights = [span.doc[i] for i in range(span.end, len(span.doc)) - | if span.doc[i].head in span] - - - +attribute("subtree") - p String - - details - summary: h4 String Views - - +attribute("string") - p String - - +attribute("lemma / lemma_") - p String - - +attribute("label / label_") - p String - -+declare_class("Lexeme") - p - | The Lexeme object represents a lexical type, stored in the vocabulary - | – as opposed to a token, occurring in a document. - p - | Lexemes store various features, so that these features can be computed - | once per type, rather than once per token. As job sizes grow, this - | can amount to a substantial efficiency improvement. - - p - | All Lexeme attributes are therefore context independent, as a single - | lexeme is reused for all usages of that word. Lexemes are keyed by - | the “orth” attribute. - - p - All Lexeme attributes are accessible directly on the Token object. - - +init - +method("__init__") - p Init - - details - summary: h4 String Features - - +attribute("orth / orth_") - p - | The form of the word with no string normalization or processing, - | as it appears in the string, without trailing whitespace. - - +attribute("lower / lower_") - p Tmp - - +attribute("norm / norm_") - p Tmp - - +attribute("shape / shape_") - p Tmp - - +attribute("prefix / prefix_") - p Tmp - - +attribute("suffix / suffix_") - p TMP - -+declare_class("Vocab", "data_dir=None, lex_props_getter=None") - +sequence - +method("__len__") - +returns - p Number of words in the vocabulary. 
- - +method("__iter__") - +returns - p Lexeme - - +maptype - +method("__getitem__", "key_int") - +params - +param("key") - p Integer ID - - +returns: p A Lexeme object - - +method("__getitem__", "key_str") - +params - +param("key_str", types.unicode) - p A string in the vocabulary - - +returns("Lexeme") - - +method("__setitem__", "orth_str", "props") - +params - +param("orth_str", types.unicode) - p The orth key - - +param("props", types.dict) - p A props dictionary - - +returns("None") - - details - summary: h4 Import/Export - - +method("dump", "loc") - +params - +param("loc", types.unicode) - p Path where the vocabulary should be saved - - +method("load_lexemes", "loc") - +params - +param("loc", types.unicode) - p Path to load the lexemes.bin file from - - +method("load_vectors", "loc") - +params - +param("loc", types.unicode) - p Path to load the vectors.bin from - -+declare_class("StringStore") - +init - Tmp - - +sequence - +method("__len__") - +returns("int") - p Number of strings in the string-store - - +method("__iter__") - +returns - p Lexeme - - +maptype - +method("__getitem__", "key_int") - +params - +param("key_int") - p An integer key - - +returns(types.unicode) - p The string that the integer key maps to - - +method("__getitem__", "key_unicode") - +params - +param("key_unicode") - p A key, as a unicode string - - +returns(types.int) - p The integer ID of the string. - - +method("__getitem__", "key_utf8_bytes") - +params - +param("key_utf8_bytes", types.bytes) - p p A key, as a UTF-8 encoded byte-string - - +returns(types.int) - p The integer ID of the string. - - details - summary: h4 Import/Export - - +method("dump", "loc") - +params - +param("loc") - p File path to save the strings.txt to. - - +method("load") - +params - +param("loc") - p File path to load the strings.txt from. diff --git a/docs/redesign/blog.jade b/docs/redesign/blog.jade deleted file mode 100644 index 8a712267d..000000000 --- a/docs/redesign/blog.jade +++ /dev/null @@ -1,95 +0,0 @@ -mixin Teaser(title, url, date_long, date_short, author, lede) - article.post - header - h2 - a(href=url)= title - .subhead - | by - a(href='#', rel='author')= author - | on - time(datetime=date_short)= date_long - p!= lede -   - a.readmore(href='#') ► - - - -doctype html -html(lang='en') - head - meta(charset='utf-8') - title spaCy Blog - meta(name='description', content='') - meta(name='author', content='Matthew Honnibal') - link(rel='stylesheet', href='css/style.css') - //if lt IE 9 - script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') - body#blog - header(role='banner') - h1.logo spaCy Blog - .slogan Blog - - nav(role="navigation") - ul - li: a(href="home.html") Home - li: a(href="docs.html") Docs - li.active: a(href="blog.html") Blog - li: a(href="license.html") License - - main#content(role='main') - section.intro.profile - p - img(src='img/matt.png') - | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. - span.social - a(href='#') Follow me on Twitter - nav(role='navigation') - ul - li - a.button(href='#') Blog - li - a.button(href='#tutorials') Tutorials - section.blogs - +Teaser( - "Introducing spaCy", - "blog_intro.html", - "February 2015", - "2015-02-18", - "Matthew Honnibal", - "spaCy is a new library for text processing in Python " + - "and Cython. I wrote it because I think small companies are terrible at " + - "natural language processing (NLP). Or rather: small companies are using " + - "terrible NLP technology." 
- ) - - +Teaser( - "Parsing English with 500 lines of Python", - "blog_parser.html", - "December 18, 2013", - "2013-12-18", - "Matthew Hannibal", - "The Natural Language Processing (NLP) community has made big progress" + - "in syntactic parsing over the last few years. It’s now possible for a" + - "tiny Python implementation to perform better than the widely-used Stanford " + - "PCFG parser.") - +Teaser( - "A good Part-of-Speech tagger in about 200 lines of Python", - "blog_tagger.html", - "October 11, 2013", - "2013-09-11", - "Matthew Honnibal", - "There are a tonne of “best known techniques” for POS tagging, and you " + - "should ignore the others and just use greedy Averaged Perceptron." - ) - - section.intro - h2 - a.permalink(href='#tutorials', name='tutorials') Tutorials - - section.tutorials - include ./tutorials.jade - - footer(role="contentinfo") - span.slogan.copyright © 2015 Syllogism Co. - - script(src='js/prism.js') diff --git a/docs/redesign/blog_intro.jade b/docs/redesign/blog_intro.jade deleted file mode 100644 index 15112f587..000000000 --- a/docs/redesign/blog_intro.jade +++ /dev/null @@ -1,81 +0,0 @@ -extends ./template_post.jade - -- - var urls = { - 'pos_post': 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/', - 'google_ngrams': "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html", - 'implementation': 'https://gist.github.com/syllog1sm/10343947', - 'redshift': 'http://github.com/syllog1sm/redshift', - 'tasker': 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm', - 'acl_anthology': 'http://aclweb.org/anthology/', - 'share_twitter': 'http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal' - } - -- var my_research_software = 'my research software' - -- var how_to_write_a_POS_tagger = 'how to write a part-of-speech tagger' - -- var parser_lnk = 'parser' - -- var buy_a_commercial_license = 'buy a commercial license' - - -block body_block - article.post - p. - spaCy is a new library for text processing in Python - and Cython. I wrote it because I think small companies are terrible at - natural language processing (NLP). Or rather: small companies are using - terrible NLP technology. - - p. - To do great NLP, you have to know a little about linguistics, a lot - about machine learning, and almost everything about the latest research. - The people who fit this description seldom join small companies. - Most are broke – they've just finished grad school. - If they don't want to stay in academia, they join Google, IBM, etc. - - p. - The net result is that outside of the tech giants, commercial NLP has - changed little in the last ten years. In academia, it's changed entirely. - Amazing improvements in quality. Orders of magnitude faster. But the - academic code is always GPL, undocumented, unuseable, or all three. - You could implement the ideas yourself, but the papers are hard to read, - and training data is exorbitantly expensive. So what are you left with? - A common answer is NLTK, which was written primarily as an educational resource. - Nothing past the tokenizer is suitable for production use. - - p. - I used to think that the NLP community just needed to do more to communicate - its findings to software engineers. So I wrote two blog posts, explaining - !{how_to_write_a_POS_tagger} and !{parser_lnk}. 
Both were well - received, and there's been a bit of interest in !{my_research_software} - – even though it's entirely undocumented, and mostly unuseable to - anyone but me. - p. - So six months ago I quit my post-doc, and I've been working day and night - on spaCy since. I'm now pleased to announce an alpha release. - - p. - If you're a small company doing NLP, I think spaCy will seem like a minor - miracle. It's by far the fastest NLP software ever released. The - full processing pipeline completes in 20ms per document, including accurate - tagging and parsing. All strings are mapped to integer IDs, tokens are - linked to embedded word representations, and a range of useful features - are pre-calculated and cached. - - p. - If none of that made any sense to you, here's the gist of it. Computers - don't understand text. This is unfortunate, because that's what the - web almost entirely consists of. We want to recommend people text based - on other text they liked. We want to shorten text to display it on a - mobile screen. We want to aggregate it, link it, filter it, categorise - it, generate it and correct it. - - p. - spaCy provides a library of utility functions that help programmers - build such products. It's commercial open source software: you can - either use it under the AGPL, or you can !{buy_a_commercial_license} - under generous terms. - - footer(role='contentinfo') diff --git a/docs/redesign/blog_parser.jade b/docs/redesign/blog_parser.jade deleted file mode 100644 index e94376e32..000000000 --- a/docs/redesign/blog_parser.jade +++ /dev/null @@ -1,938 +0,0 @@ -extends ./template_post.jade - - -block body_block - - var urls = {} - //- urls.pos_post = 'https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/' - - urls.parser_post = "http://googleresearch.blogspot.com.au/2013/05/syntactic-ngrams-over-time.html" - - urls.implementation = 'https://gist.github.com/syllog1sm/10343947' - - urls.redshift = 'http://github.com/syllog1sm/redshift' - - urls.tasker = 'https://play.google.com/store/apps/details?id=net.dinglisch.android.taskerm' - - urls.acl_anthology = 'http://aclweb.org/anthology/' - - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" - - // A comment - - article.post - header - h2 Parsing English in 500 lines of Python - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2013-12-18') December 18, 2013 - p - | A - a(href=urls.google_ngrams) syntactic parser - | describes a sentence’s grammatical structure, to help another - | application reason about it. Natural languages introduce many unexpected - | ambiguities, which our world-knowledge immediately filters out. A - | favourite example: - - p.example They ate the pizza with anchovies - - p - img(src='img/blog01.png', alt='Eat-with pizza-with ambiguity') - p - | A correct parse links “with” to “pizza”, while an incorrect parse - | links “with” to “eat”: - - .displacy - iframe(src='displacy/anchovies_bad.html', height='275') - - .displacy - iframe.displacy(src='displacy/anchovies_good.html', height='275') - a.view-displacy(href='#') View on displaCy - p.caption - | The Natural Language Processing (NLP) community has made big progress - | in syntactic parsing over the last few years. - - p - | The Natural Language Processing (NLP) community has made big progress - | in syntactic parsing over the last few years. 
It’s now possible for - | a tiny Python implementation to perform better than the widely-used - | Stanford PCFG parser. - - p - strong Update! - | The Stanford CoreNLP library now includes a greedy transition-based - | dependency parser, similar to the one described in this post, but with - | an improved learning strategy. It is much faster and more accurate - | than this simple Python implementation. - - table - thead - tr - th Parser - th Accuracy - th Speed (w/s) - th Language - th LOC - tbody - tr - td Stanford - td 89.6% - td 19 - td Java - td - | > 4,000 - sup - a(href='#note-1') [1] - tr - td - strong parser.py - td 89.8% - td 2,020 - td Python - strong ~500 - tr - td Redshift - td - strong 93.6% - td - strong 2,580 - td Cython - td ~4,000 - p - | The rest of the post sets up the problem, and then takes you through - a(href=urls.implementation) a concise implementation - | , prepared for this post. The first 200 lines of parser.py, the - | part-of-speech tagger and learner, are described - a(href=pos_tagger_url) here. You should probably at least skim that - | post before reading this one, unless you’re very familiar with NLP - | research. - p - | The Cython system, Redshift, was written for my current research. I - | plan to improve it for general use in June, after my contract ends - | at Macquarie University. The current version is - a(href=urls.redshift) hosted on GitHub - | . - h3 Problem Description - - p It’d be nice to type an instruction like this into your phone: - - p.example - Set volume to zero when I’m in a meeting, unless John’s school calls. - p - | And have it set the appropriate policy. On Android you can do this - | sort of thing with - a(href=urls.tasker) Tasker - | , but an NL interface would be much better. It’d be especially nice - | to receive a meaning representation you could edit, so you could see - | what it thinks you said, and correct it. - p - | There are lots of problems to solve to make that work, but some sort - | of syntactic representation is definitely necessary. We need to know that: - - p.example - Unless John’s school calls, when I’m in a meeting, set volume to zero - - p is another way of phrasing the first instruction, while: - - p.example - Unless John’s school, call when I’m in a meeting - - p means something completely different. - - p - | A dependency parser returns a graph of word-word relationships, - | intended to make such reasoning easier. Our graphs will be trees – - | edges will be directed, and every node (word) will have exactly one - | incoming arc (one dependency, with its head), except one. - - h4 Example usage - - pre.language-python - code - | parser = parser.Parser() - | tokens = "Set the volume to zero when I 'm in a meeting unless John 's school calls".split() - | >>> tags, heads = parser.parse(tokens) - | >>> heads - | [-1, 2, 0, 0, 3, 0, 7, 5, 7, 10, 8, 0, 13, 15, 15, 11] - | >>> for i, h in enumerate(heads): - | ... head = tokens[heads[h]] if h >= 1 else 'None' - | ... print(tokens[i] + ' <-- ' + head]) - | Set <-- None - | the <-- volume - | volume <-- Set - | to <-- Set - | zero <-- to - | when <-- Set - | I <-- 'm - | 'm <-- when - | in <-- 'm - | a <-- meeting - | meeting <-- in - | unless <-- Set - | John <-- 's - | 's <-- calls - | school <-- calls - | calls <-- unless - - p. - The idea is that it should be slightly easier to reason from the parse, - than it was from the string. The parse-to-meaning mapping is hopefully - simpler than the string-to-meaning mapping. - - p. 
- The most confusing thing about this problem area is that “correctness” - is defined by convention — by annotation guidelines. If you haven’t - read the guidelines and you’re not a linguist, you can’t tell whether - the parse is “wrong” or “right”, which makes the whole task feel weird - and artificial. - - p. - For instance, there’s a mistake in the parse above: “John’s school - calls” is structured wrongly, according to the Stanford annotation - guidelines. The structure of that part of the sentence is how the - annotators were instructed to parse an example like “John’s school - clothes”. - - p - | It’s worth dwelling on this point a bit. We could, in theory, have - | written our guidelines so that the “correct” parses were reversed. - | There’s good reason to believe the parsing task will be harder if we - | reversed our convention, as it’d be less consistent with the rest of - | the grammar. - sup: a(href='#note-2') [2] - | But we could test that empirically, and we’d be pleased to gain an - | advantage by reversing the policy. - - p - | We definitely do want that distinction in the guidelines — we don’t - | want both to receive the same structure, or our output will be less - | useful. The annotation guidelines strike a balance between what - | distinctions downstream applications will find useful, and what - | parsers will be able to predict easily. - - h4 Projective trees - - p - | There’s a particularly useful simplification that we can make, when - | deciding what we want the graph to look like: we can restrict the - | graph structures we’ll be dealing with. This doesn’t just give us a - | likely advantage in learnability; it can have deep algorithmic - | implications. We follow most work on English in constraining the - | dependency graphs to be - em projective trees - | : - - ol - li Tree. Every word has exactly one head, except for the dummy ROOT symbol. - li - | Projective. For every pair of dependencies (a1, a2) and (b1, b2), - | if a1 < b2, then a2 >= b2. In other words, dependencies cannot “cross”. - | You can’t have a pair of dependencies that goes a1 b1 a2 b2, or - | b1 a1 b2 a2. - - p - | There’s a rich literature on parsing non-projective trees, and a - | smaller literature on parsing DAGs. But the parsing algorithm I’ll - | be explaining deals with projective trees. - - h3 Greedy transition-based parsing - - p - | Our parser takes as input a list of string tokens, and outputs a - | list of head indices, representing edges in the graph. If the - - em i - - | th member of heads is - - em j - - | , the dependency parse contains an edge (j, i). A transition-based - | parser is a finite-state transducer; it maps an array of N words - | onto an output array of N head indices: - - table.center - tbody - tr - td - em start - td MSNBC - td reported - td that - td Facebook - td bought - td WhatsApp - td for - td $16bn - td - em root - tr - td 0 - td 2 - td 9 - td 2 - td 4 - td 2 - td 4 - td 4 - td 7 - td 0 - p - | The heads array denotes that the head of - em MSNBC - | is - em reported - | : - em MSNBC - | is word 1, and - em reported - | is word 2, and - code.language-python heads[1] == 2 - | . You can already see why parsing a tree is handy — this data structure - | wouldn’t work if we had to output a DAG, where words may have multiple - | heads. - - p - | Although - code.language-python heads - | can be represented as an array, we’d actually like to maintain some - | alternate ways to access the parse, to make it easy and efficient to - | extract features. 
Our - - code.language-python Parse - | class looks like this: - - pre.language-python - code - | class Parse(object): - | def __init__(self, n): - | self.n = n - | self.heads = [None] * (n-1) - | self.lefts = [] - | self.rights = [] - | for i in range(n+1): - | self.lefts.append(DefaultList(0)) - | self.rights.append(DefaultList(0)) - | - | def add_arc(self, head, child): - | self.heads[child] = head - | if child < head: - | self.lefts[head].append(child) - | else: - | self.rights[head].append(child) - - p - | As well as the parse, we also have to keep track of where we’re up - | to in the sentence. We’ll do this with an index into the - code.language-python words - | array, and a stack, to which we’ll push words, before popping them - | once their head is set. So our state data structure is fundamentally: - - ul - li An index, i, into the list of tokens; - li The dependencies added so far, in Parse - li - | A stack, containing words that occurred before i, for which we’re - | yet to assign a head. - - p Each step of the parsing process applies one of three actions to the state: - - pre.language-python - code - | SHIFT = 0; RIGHT = 1; LEFT = 2 - | MOVES = [SHIFT, RIGHT, LEFT] - | - | def transition(move, i, stack, parse): - | global SHIFT, RIGHT, LEFT - | if move == SHIFT: - | stack.append(i) - | return i + 1 - | elif move == RIGHT: - | parse.add_arc(stack[-2], stack.pop()) - | return i - | elif move == LEFT: - | parse.add_arc(i, stack.pop()) - | return i - | raise GrammarError("Unknown move: %d" % move) - - - - p - | The - code.language-python LEFT - | and - code.language-python RIGHT - | actions add dependencies and pop the stack, while - code.language-python SHIFT - | pushes the stack and advances i into the buffer. - p. - So, the parser starts with an empty stack, and a buffer index at 0, with - no dependencies recorded. It chooses one of the (valid) actions, and - applies it to the state. It continues choosing actions and applying - them until the stack is empty and the buffer index is at the end of - the input. (It’s hard to understand this sort of algorithm without - stepping through it. Try coming up with a sentence, drawing a projective - parse tree over it, and then try to reach the parse tree by choosing - the right sequence of transitions.) - - p Here’s what the parsing loop looks like in code: - - pre.language-python - code - | class Parser(object): - | ... - | def parse(self, words): - | tags = self.tagger(words) - | n = len(words) - | idx = 1 - | stack = [0] - | deps = Parse(n) - | while stack or idx < n: - | features = extract_features(words, tags, idx, n, stack, deps) - | scores = self.model.score(features) - | valid_moves = get_valid_moves(i, n, len(stack)) - | next_move = max(valid_moves, key=lambda move: scores[move]) - | idx = transition(next_move, idx, stack, parse) - | return tags, parse - | - | def get_valid_moves(i, n, stack_depth): - | moves = [] - | if i < n: - | moves.append(SHIFT) - | if stack_depth <= 2: - | moves.append(RIGHT) - | if stack_depth <= 1: - | moves.append(LEFT) - | return moves - - p. - We start by tagging the sentence, and initializing the state. We then - map the state to a set of features, which we score using a linear model. - We then find the best-scoring valid move, and apply it to the state. - - p - | The model scoring works the same as it did in - a(href=urls.post) the POS tagger. - | If you’re confused about the idea of extracting features and scoring - | them with a linear model, you should review that post. 
Here’s a reminder - | of how the model scoring works: - - pre.language-python - code - | class Perceptron(object) - | ... - | def score(self, features): - | all_weights = self.weights - | scores = dict((clas, 0) for clas in self.classes) - | for feat, value in features.items(): - | if value == 0: - | continue - | if feat not in all_weights: - | continue - | weights = all_weights[feat] - | for clas, weight in weights.items(): - | scores[clas] += value * weight - | return scores - - p. - It’s just summing the class-weights for each feature. This is often - expressed as a dot-product, but when you’re dealing with multiple - classes, that gets awkward, I find. - - p. - The beam parser (RedShift) tracks multiple candidates, and only decides - on the best one at the very end. We’re going to trade away accuracy - in favour of efficiency and simplicity. We’ll only follow a single - analysis. Our search strategy will be entirely greedy, as it was with - the POS tagger. We’ll lock-in our choices at every step. - - p. - If you read the POS tagger post carefully, you might see the underlying - similarity. What we’ve done is mapped the parsing problem onto a - sequence-labelling problem, which we address using a “flat”, or unstructured, - learning algorithm (by doing greedy search). - - h3 Features - p. - Feature extraction code is always pretty ugly. The features for the parser - refer to a few tokens from the context: - - ul - li The first three words of the buffer (n0, n1, n2) - li The top three words of the stack (s0, s1, s2) - li The two leftmost children of s0 (s0b1, s0b2); - li The two rightmost children of s0 (s0f1, s0f2); - li The two leftmost children of n0 (n0b1, n0b2) - - p. - For these 12 tokens, we refer to the word-form, the part-of-speech tag, - and the number of left and right children attached to the token. - - p. - Because we’re using a linear model, we have our features refer to pairs - and triples of these atomic properties. 
- - pre.language-python - code - | def extract_features(words, tags, n0, n, stack, parse): - | def get_stack_context(depth, stack, data): - | if depth >= 3: - | return data[stack[-1]], data[stack[-2]], data[stack[-3]] - | elif depth >= 2: - | return data[stack[-1]], data[stack[-2]], '' - | elif depth == 1: - | return data[stack[-1]], '', '' - | else: - | return '', '', '' - | - | def get_buffer_context(i, n, data): - | if i + 1 >= n: - | return data[i], '', '' - | elif i + 2 >= n: - | return data[i], data[i + 1], '' - | else: - | return data[i], data[i + 1], data[i + 2] - | - | def get_parse_context(word, deps, data): - | if word == -1: - | return 0, '', '' - | deps = deps[word] - | valency = len(deps) - | if not valency: - | return 0, '', '' - | elif valency == 1: - | return 1, data[deps[-1]], '' - | else: - | return valency, data[deps[-1]], data[deps[-2]] - | - | features = {} - | # Set up the context pieces --- the word, W, and tag, T, of: - | # S0-2: Top three words on the stack - | # N0-2: First three words of the buffer - | # n0b1, n0b2: Two leftmost children of the first word of the buffer - | # s0b1, s0b2: Two leftmost children of the top word of the stack - | # s0f1, s0f2: Two rightmost children of the top word of the stack - | - | depth = len(stack) - | s0 = stack[-1] if depth else -1 - | - | Ws0, Ws1, Ws2 = get_stack_context(depth, stack, words) - | Ts0, Ts1, Ts2 = get_stack_context(depth, stack, tags) - | - | Wn0, Wn1, Wn2 = get_buffer_context(n0, n, words) - | Tn0, Tn1, Tn2 = get_buffer_context(n0, n, tags) - | - | Vn0b, Wn0b1, Wn0b2 = get_parse_context(n0, parse.lefts, words) - | Vn0b, Tn0b1, Tn0b2 = get_parse_context(n0, parse.lefts, tags) - | - | Vn0f, Wn0f1, Wn0f2 = get_parse_context(n0, parse.rights, words) - | _, Tn0f1, Tn0f2 = get_parse_context(n0, parse.rights, tags) - | - | Vs0b, Ws0b1, Ws0b2 = get_parse_context(s0, parse.lefts, words) - | _, Ts0b1, Ts0b2 = get_parse_context(s0, parse.lefts, tags) - | - | Vs0f, Ws0f1, Ws0f2 = get_parse_context(s0, parse.rights, words) - | _, Ts0f1, Ts0f2 = get_parse_context(s0, parse.rights, tags) - | - | # Cap numeric features at 5? 
- | # String-distance - | Ds0n0 = min((n0 - s0, 5)) if s0 != 0 else 0 - | - | features['bias'] = 1 - | # Add word and tag unigrams - | for w in (Wn0, Wn1, Wn2, Ws0, Ws1, Ws2, Wn0b1, Wn0b2, Ws0b1, Ws0b2, Ws0f1, Ws0f2): - | if w: - | features['w=%s' % w] = 1 - | for t in (Tn0, Tn1, Tn2, Ts0, Ts1, Ts2, Tn0b1, Tn0b2, Ts0b1, Ts0b2, Ts0f1, Ts0f2): - | if t: - | features['t=%s' % t] = 1 - | - | # Add word/tag pairs - | for i, (w, t) in enumerate(((Wn0, Tn0), (Wn1, Tn1), (Wn2, Tn2), (Ws0, Ts0))): - | if w or t: - | features['%d w=%s, t=%s' % (i, w, t)] = 1 - | - | # Add some bigrams - | features['s0w=%s, n0w=%s' % (Ws0, Wn0)] = 1 - | features['wn0tn0-ws0 %s/%s %s' % (Wn0, Tn0, Ws0)] = 1 - | features['wn0tn0-ts0 %s/%s %s' % (Wn0, Tn0, Ts0)] = 1 - | features['ws0ts0-wn0 %s/%s %s' % (Ws0, Ts0, Wn0)] = 1 - | features['ws0-ts0 tn0 %s/%s %s' % (Ws0, Ts0, Tn0)] = 1 - | features['wt-wt %s/%s %s/%s' % (Ws0, Ts0, Wn0, Tn0)] = 1 - | features['tt s0=%s n0=%s' % (Ts0, Tn0)] = 1 - | features['tt n0=%s n1=%s' % (Tn0, Tn1)] = 1 - | - | # Add some tag trigrams - | trigrams = ((Tn0, Tn1, Tn2), (Ts0, Tn0, Tn1), (Ts0, Ts1, Tn0), - | (Ts0, Ts0f1, Tn0), (Ts0, Ts0f1, Tn0), (Ts0, Tn0, Tn0b1), - | (Ts0, Ts0b1, Ts0b2), (Ts0, Ts0f1, Ts0f2), (Tn0, Tn0b1, Tn0b2), - | (Ts0, Ts1, Ts1)) - | for i, (t1, t2, t3) in enumerate(trigrams): - | if t1 or t2 or t3: - | features['ttt-%d %s %s %s' % (i, t1, t2, t3)] = 1 - | - | # Add some valency and distance features - | vw = ((Ws0, Vs0f), (Ws0, Vs0b), (Wn0, Vn0b)) - | vt = ((Ts0, Vs0f), (Ts0, Vs0b), (Tn0, Vn0b)) - | d = ((Ws0, Ds0n0), (Wn0, Ds0n0), (Ts0, Ds0n0), (Tn0, Ds0n0), - | ('t' + Tn0+Ts0, Ds0n0), ('w' + Wn0+Ws0, Ds0n0)) - | for i, (w_t, v_d) in enumerate(vw + vt + d): - | if w_t or v_d: - | features['val/d-%d %s %d' % (i, w_t, v_d)] = 1 - | return features - - - h3 Training - - p. - Weights are learned using the same algorithm, averaged perceptron, that - we used for part-of-speech tagging. Its key strength is that it’s an - online learning algorithm: examples stream in one-by-one, we make our - prediction, check the actual answer, and adjust our beliefs (weights) - if we were wrong. - - p The training loop looks like this: - - pre.language-python - code - | class Parser(object): - | ... - | def train_one(self, itn, words, gold_tags, gold_heads): - | n = len(words) - | i = 2; stack = [1]; parse = Parse(n) - | tags = self.tagger.tag(words) - | while stack or (i + 1) < n: - | features = extract_features(words, tags, i, n, stack, parse) - | scores = self.model.score(features) - | valid_moves = get_valid_moves(i, n, len(stack)) - | guess = max(valid_moves, key=lambda move: scores[move]) - | gold_moves = get_gold_moves(i, n, stack, parse.heads, gold_heads) - | best = max(gold_moves, key=lambda move: scores[move]) - | self.model.update(best, guess, features) - | i = transition(guess, i, stack, parse) - | # Return number correct - | return len([i for i in range(n-1) if parse.heads[i] == gold_heads[i]]) - - - - p - | The most interesting part of the training process is in - code.language-python get_gold_moves. - | The performance of our parser is made possible by an advance by Goldberg - | and Nivre (2012), who showed that we’d been doing this wrong for years. - - p - | In the POS-tagging post, I cautioned that during training you need to - | make sure you pass in the last two - em predicted - | tags as features for the current tag, not the last two - em gold - | tags. 
At test time you’ll only have the predicted tags, so if you - | base your features on the gold sequence during training, your training - | contexts won’t resemble your test-time contexts, so you’ll learn the - | wrong weights. - - p. - In parsing, the problem was that we didn’t know - em how - | to pass in the predicted sequence! Training worked by taking the - | gold-standard tree, and finding a transition sequence that led to it. - | i.e., you got back a sequence of moves, with the guarantee that if - | you followed those moves, you’d get the gold-standard dependencies. - - p - | The problem is, we didn’t know how to define the “correct” move to - | teach a parser to make if it was in any state that - em wasn’t - | along that gold-standard sequence. Once the parser had made a mistake, - | we didn’t know how to train from that example. - - p - | That was a big problem, because it meant that once the parser started - | making mistakes, it would end up in states unlike any in its training - | data – leading to yet more mistakes. The problem was specific - | to greedy parsers: once you use a beam, there’s a natural way to do - | structured prediction. - p - | The solution seems obvious once you know it, like all the best breakthroughs. - | What we do is define a function that asks “How many gold-standard - | dependencies can be recovered from this state?”. If you can define - | that function, then you can apply each move in turn, and ask, “How - | many gold-standard dependencies can be recovered from - em this - | state?”. If the action you applied allows - em fewer - | gold-standard dependencies to be reached, then it is sub-optimal. - - p That’s a lot to take in. - - p - | So we have this function - code Oracle(state) - | : - pre - code - | Oracle(state) = | gold_arcs ∩ reachable_arcs(state) | - p - | We also have a set of actions, each of which returns a new state. - | We want to know: - - ul - li shift_cost = Oracle(state) – Oracle(shift(state)) - li right_cost = Oracle(state) – Oracle(right(state)) - li left_cost = Oracle(state) – Oracle(left(state)) - - p - | Now, at least one of those costs - em has - | to be zero. Oracle(state) is asking, “what’s the cost of the best - | path forward?”, and the first action of that best path has to be - | shift, right, or left. - - p - | It turns out that we can derive Oracle fairly simply for many transition - | systems. The derivation for the transition system we’re using, Arc - | Hybrid, is in Goldberg and Nivre (2013). - - p - | We’re going to implement the oracle as a function that returns the - | zero-cost moves, rather than implementing a function Oracle(state). - | This prevents us from doing a bunch of costly copy operations. - | Hopefully the reasoning in the code isn’t too hard to follow, but - | you can also consult Goldberg and Nivre’s papers if you’re confused - | and want to get to the bottom of this. 
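-
- p
- | For contrast, here’s roughly what a direct transcription of the cost
- | definitions above would look like – a sketch only, with a hypothetical
- | move_costs helper that takes the oracle and a copying apply_move
- | function as arguments, rather than anything from the parser. It also
- | shows why we avoid this formulation: every candidate move needs its
- | own copy of the state.
-
- pre.language-python
- code
- | def move_costs(state, gold, valid_moves, oracle, apply_move):
- | '''Sketch: cost of a move = gold arcs reachable now, minus gold arcs
- | reachable after taking it. oracle(state, gold) and apply_move(move, state)
- | are hypothetical helpers; apply_move must return a fresh copy of the
- | state, which is exactly the expense we want to avoid.'''
- | base = oracle(state, gold)
- | # At least one move always comes out with zero cost: the best path
- | # forward has to start with shift, right or left.
- | return dict((move, base - oracle(apply_move(move, state), gold))
- | for move in valid_moves)
-
- p The zero-cost version we’ll actually use looks like this: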
- - pre.language-python - code - | def get_gold_moves(n0, n, stack, heads, gold): - | def deps_between(target, others, gold): - | for word in others: - | if gold[word] == target or gold[target] == word: - | return True - | return False - | - | valid = get_valid_moves(n0, n, len(stack)) - | if not stack or (SHIFT in valid and gold[n0] == stack[-1]): - | return [SHIFT] - | if gold[stack[-1]] == n0: - | return [LEFT] - | costly = set([m for m in MOVES if m not in valid]) - | # If the word behind s0 is its gold head, Left is incorrect - | if len(stack) >= 2 and gold[stack[-1]] == stack[-2]: - | costly.add(LEFT) - | # If there are any dependencies between n0 and the stack, - | # pushing n0 will lose them. - | if SHIFT not in costly and deps_between(n0, stack, gold): - | costly.add(SHIFT) - | # If there are any dependencies between s0 and the buffer, popping - | # s0 will lose them. - | if deps_between(stack[-1], range(n0+1, n-1), gold): - | costly.add(LEFT) - | costly.add(RIGHT) - | return [m for m in MOVES if m not in costly] - - - - p - | Doing this “dynamic oracle” training procedure makes a big difference - | to accuracy — typically 1-2%, with no difference to the way the run-time - | works. The old “static oracle” greedy training procedure is fully - | obsolete; there’s no reason to do it that way any more. - - h3 Conclusion - - p - | I have the sense that language technologies, particularly those relating - | to grammar, are particularly mysterious. I can imagine having no idea - | what the program might even do. - - p - | I think it therefore seems natural to people that the best solutions - | would be over-whelmingly complicated. A 200,000 line Java package - | feels appropriate. - p - | But, algorithmic code is usually short, when only a single algorithm - | is implemented. And when you only implement one algorithm, and you - | know exactly what you want to write before you write a line, you - | also don’t pay for any unnecessary abstractions, which can have a - | big performance impact. - - h3 Notes - p - a(name='note-1') - | [1] I wasn’t really sure how to count the lines of code in the Stanford - | parser. Its jar file ships over 200k, but there are a lot of different - | models in it. It’s not important, but it's certainly over 4k. - - p - a(name='note-2') - | [2] For instance, how would you parse, “John’s school of music calls”? - | You want to make sure the phrase “John’s school” has a consistent - | structure in both “John’s school calls” and “John’s school of music - | calls”. Reasoning about the different “slots” you can put a phrase - | into is a key way we reason about what syntactic analyses look like. - | You can think of each phrase as having a different shaped connector, - | which you need to plug into different slots — which each phrase also - | has a certain number of, each of a different shape. We’re trying to - | figure out what connectors are where, so we can figure out how the - | sentences are put together. - - h3 Idle speculation - p - | For a long time, incremental language processing algorithms were - | primarily of scientific interest. If you want to write a parser to - | test a theory about how the human sentence processor might work, well, - | that parser needs to build partial interpretations. There’s a wealth - | of evidence, including commonsense introspection, that establishes - | that we don’t buffer input and analyse it once the speaker has finished. - - p - | But now algorithms with that neat scientific feature are winning! 
- | As best as I can tell, the secret to that success is to be: - - ul - li Incremental. Earlier words constrain the search. - li - | Error-driven. Training involves a working hypothesis, which is - | updated as it makes mistakes. - - p - | The links to human sentence processing seem tantalising. I look - | forward to seeing whether these engineering breakthroughs lead to - | any psycholinguistic advances. - - h3 Bibliography - - p - | The NLP literature is almost entirely open access. All of the relavant - | papers can be found - a(href=urls.acl_anthology, rel='nofollow') here - | . - p - | The parser I’ve described is an implementation of the dynamic-oracle - | Arc-Hybrid system here: - - span.bib-item - | Goldberg, Yoav; Nivre, Joakim. - em Training Deterministic Parsers with Non-Deterministic Oracles - | . TACL 2013 - p - | However, I wrote my own features for it. The arc-hybrid system was - | originally described here: - - span.bib-item - | Kuhlmann, Marco; Gomez-Rodriguez, Carlos; Satta, Giorgio. Dynamic - | programming algorithms for transition-based dependency parsers. ACL 2011 - - p - | The dynamic oracle training method was first described here: - span.bib-item - | A Dynamic Oracle for Arc-Eager Dependency Parsing. Goldberg, Yoav; - | Nivre, Joakim. COLING 2012 - - p - | This work depended on a big break-through in accuracy for transition-based - | parsers, when beam-search was properly explored by Zhang and Clark. - | They have several papers, but the preferred citation is: - - span.bib-item - | Zhang, Yue; Clark, Steven. Syntactic Processing Using the Generalized - | Perceptron and Beam Search. Computational Linguistics 2011 (1) - p - | Another important paper was this little feature engineering paper, - | which further improved the accuracy: - - span.bib-item - | Zhang, Yue; Nivre, Joakim. Transition-based Dependency Parsing with - | Rich Non-local Features. ACL 2011 - - p - | The generalised perceptron, which is the learning framework for these - | beam parsers, is from this paper: - span.bib-item - | Collins, Michael. Discriminative Training Methods for Hidden Markov - | Models: Theory and Experiments with Perceptron Algorithms. EMNLP 2002 - - h3 Experimental details - p - | The results at the start of the post refer to Section 22 of the Wall - | Street Journal corpus. The Stanford parser was run as follows: - - pre.language-bash - code - | java -mx10000m -cp "$scriptdir/*:" edu.stanford.nlp.parser.lexparser.LexicalizedParser \ - | -outputFormat "penn" edu/stanford/nlp/models/lexparser/englishFactored.ser.gz $* - - - - p - | A small post-process was applied, to undo the fancy tokenisation - | Stanford adds for numbers, to make them match the PTB tokenisation: - - pre.language-python - code - | """Stanford parser retokenises numbers. 
Split them.""" - | import sys - | import re - | - | qp_re = re.compile('\xc2\xa0') - | for line in sys.stdin: - | line = line.rstrip() - | if qp_re.search(line): - | line = line.replace('(CD', '(QP (CD', 1) + ')' - | line = line.replace('\xc2\xa0', ') (CD ') - | print line - - p - | The resulting PTB-format files were then converted into dependencies - | using the Stanford converter: - - pre.language-bash - code - | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp - | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ - | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll - p - | I can’t easily read that anymore, but it should just convert every - | .mrg file in a folder to a CoNLL-format Stanford basic dependencies - | file, using the settings common in the dependency literature. - - p - | I then converted the gold-standard trees from WSJ 22, for the evaluation. - | Accuracy scores refer to unlabelled attachment score (i.e. the head index) - | of all non-punctuation tokens. - - p - | To train parser.py, I fed the gold-standard PTB trees for WSJ 02-21 - | into the same conversion script. - - p - | In a nutshell: The Stanford model and parser.py are trained on the - | same set of sentences, and they each make their predictions on a - | held-out test set, for which we know the answers. Accuracy refers - | to how many of the words’ heads we got correct. - - p - | Speeds were measured on a 2.4Ghz Xeon. I ran the experiments on a - | server, to give the Stanford parser more memory. The parser.py system - | runs fine on my MacBook Air. I used PyPy for the parser.py experiments; - | CPython was about half as fast on an early benchmark. - - p - | One of the reasons parser.py is so fast is that it does unlabelled - | parsing. Based on previous experiments, a labelled parser would likely - | be about 40x slower, and about 1% more accurate. Adapting the program - | to labelled parsing would be a good exercise for the reader, if you - | have access to the data. - - p - | The result from the Redshift parser was produced from commit - code.language-python b6b624c9900f3bf - | , which was run as follows: - pre.language-bash - code - | ./scripts/train.py -x zhang+stack -k 8 -p ~/data/stanford/train.conll ~/data/parsers/tmp - | ./scripts/parse.py ~/data/parsers/tmp ~/data/stanford/devi.txt /tmp/parse/ - | ./scripts/evaluate.py /tmp/parse/parses ~/data/stanford/dev.conll< - - footer.meta(role='contentinfo') - a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter - .discuss - a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News - | - a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit diff --git a/docs/redesign/blog_tagger.jade b/docs/redesign/blog_tagger.jade deleted file mode 100644 index 63ac8e77e..000000000 --- a/docs/redesign/blog_tagger.jade +++ /dev/null @@ -1,492 +0,0 @@ -extends ./template_post.jade - -block body_block - - var urls = {} - - urls.share_twitter = "http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" - - - article.post - header - h2 A good Part-of-Speech tagger in about 200 lines of Python - .subhead - | by - a(href="#" rel="author") Matthew Honnibal - | on - time(datetime='2013-09-11') October 11, 2013 - - p. - Up-to-date knowledge about natural language processing is mostly locked away - in academia. 
And academics are mostly pretty self-conscious when we write. - We’re careful. We don’t want to stick our necks out too much. But under-confident - recommendations suck, so here’s how to write a good part-of-speech tagger. - - p. - There are a tonne of “best known techniques” for POS tagging, and you should - ignore the others and just use Averaged Perceptron. - - p. - You should use two tags of history, and features derived from the Brown word - clusters distributed here. - - p. - If you only need the tagger to work on carefully edited text, you should - use case-sensitive features, but if you want a more robust tagger you - should avoid them because they’ll make you over-fit to the conventions - of your training domain. Instead, features that ask “how frequently is - this word title-cased, in a large sample from the web?” work well. Then - you can lower-case your comparatively tiny training corpus. - - p. - For efficiency, you should figure out which frequent words in your training - data have unambiguous tags, so you don’t have to do anything but output - their tags when they come up. About 50% of the words can be tagged that way. - - p. - And unless you really, really can’t do without an extra 0.1% of accuracy, - you probably shouldn’t bother with any kind of search strategy you should - just use a greedy model. - - p. - If you do all that, you’ll find your tagger easy to write and understand, - and an efficient Cython implementation will perform as follows on the standard - evaluation, 130,000 words of text from the Wall Street Journal: - - table - thead - tr - th Tagger - th Accuracy - th Time (130k words) - tbody - tr - td CyGreedyAP - td 97.1% - td 4s - - p. - The 4s includes initialisation time — the actual per-token speed is high - enough to be irrelevant; it won’t be your bottleneck. - - p. - It’s tempting to look at 97% accuracy and say something similar, but that’s - not true. My parser is about 1% more accurate if the input has hand-labelled - POS tags, and the taggers all perform much worse on out-of-domain data. - Unfortunately accuracies have been fairly flat for the last ten years. - That’s why my recommendation is to just use a simple and fast tagger that’s - roughly as good. - - p. - The thing is though, it’s very common to see people using taggers that - aren’t anywhere near that good! For an example of what a non-expert is - likely to use, these were the two taggers wrapped by TextBlob, a new Python - api that I think is quite neat: - - table - thead - tr - th Tagger - th Accuracy - th Time (130k words) - tbody - tr - td NLTK - td 94.0% - td 3m56s - tr - td Pattern - td 93.5% - td 26s - - p. - Both Pattern and NLTK are very robust and beautifully well documented, so - the appeal of using them is obvious. But Pattern’s algorithms are pretty - crappy, and NLTK carries tremendous baggage around in its implementation - because of its massive framework, and double-duty as a teaching tool. - - p. - As a stand-alone tagger, my Cython implementation is needlessly complicated - – it was written for my parser. So today I wrote a 200 line version - of my recommended algorithm for TextBlob. It gets: - - table - thead - tr - th Tagger - th Accuracy - th Time (130k words) - tbody - tr - td PyGreedyAP - td 96.8% - td 12s - - p. - I traded some accuracy and a lot of efficiency to keep the implementation - simple. Here’s a far-too-brief description of how it works. - - h3 Averaged perceptron - - p. - POS tagging is a “supervised learning problem”. 
You’re given a table of data, - and you’re told that the values in the last column will be missing during - run-time. You have to find correlations from the other columns to predict - that value. - - p. - So for us, the missing column will be “part of speech at word i“. The predictor - columns (features) will be things like “part of speech at word i-1“, “last three - letters of word at i+1“, etc - - p. - First, here’s what prediction looks like at run-time: - - pre.language-python - code - | def predict(self, features): - | '''Dot-product the features and current weights and return the best class.''' - | scores = defaultdict(float) - | for feat in features: - | if feat not in self.weights: - | continue - | weights = self.weights[feat] - | for clas, weight in weights.items(): - | scores[clas] += weight - | # Do a secondary alphabetic sort, for stability - | return max(self.classes, key=lambda clas: (scores[clas], clas)) - - p. - Earlier I described the learning problem as a table, with one of the columns - marked as missing-at-runtime. For NLP, our tables are always exceedingly - sparse. You have columns like “word i-1=Parliament”, which is almost always - 0. So our “weight vectors” can pretty much never be implemented as vectors. - Map-types are good though — here we use dictionaries. - - p. - The input data, features, is a set with a member for every non-zero “column” - in our “table” – every active feature. Usually this is actually a dictionary, - to let you set values for the features. But here all my features are binary - present-or-absent type deals. - - p. - The weights data-structure is a dictionary of dictionaries, that ultimately - associates feature/class pairs with some weight. You want to structure it - this way instead of the reverse because of the way word frequencies are - distributed: most words are rare, frequent words are very frequent. - - h3 Learning the weights - - p. - Okay, so how do we get the values for the weights? We start with an empty - weights dictionary, and iteratively do the following: - - ol - li Receive a new (features, POS-tag) pair - li Guess the value of the POS tag given the current “weights” for the features - li If guess is wrong, add +1 to the weights associated with the correct class for these features, and -1 to the weights for the predicted class. - - - p. - It’s one of the simplest learning algorithms. Whenever you make a mistake, - increment the weights for the correct class, and penalise the weights that - led to your false prediction. In code: - - pre.language-python - code - | def train(self, nr_iter, examples): - | for i in range(nr_iter): - | for features, true_tag in examples: - | guess = self.predict(features) - | if guess != true_tag: - | for f in features: - | self.weights[f][true_tag] += 1 - | self.weights[f][guess] -= 1 - | random.shuffle(examples) - p. - If you iterate over the same example this way, the weights for the correct - class would have to come out ahead, and you’d get the example right. If - you think about what happens with two examples, you should be able to - see that it will get them both right unless the features are identical. - In general the algorithm will converge so long as the examples are - linearly separable, although that doesn’t matter for our purpose. - - h3 Averaging the weights - - p. - We need to do one more thing to make the perceptron algorithm competitive. 
- The problem with the algorithm so far is that if you train it twice on - slightly different sets of examples, you end up with really different models. - It doesn’t generalise that smartly. And the problem is really in the later - iterations — if you let it run to convergence, it’ll pay lots of attention - to the few examples it’s getting wrong, and mutate its whole model around - them. - - p. - So, what we’re going to do is make the weights more "sticky" – give - the model less chance to ruin all its hard work in the later rounds. And - we’re going to do that by returning the averaged weights, not the final - weights. - - p. - I doubt there are many people who are convinced that’s the most obvious - solution to the problem, but whatever. We’re not here to innovate, and this - way is time tested on lots of problems. If you have another idea, run the - experiments and tell us what you find. Actually I’d love to see more work - on this, now that the averaged perceptron has become such a prominent learning - algorithm in NLP. - - p. - Okay. So this averaging. How’s that going to work? Note that we don’t want - to just average after each outer-loop iteration. We want the average of all - the values — from the inner loop. So if we have 5,000 examples, and we train - for 10 iterations, we’ll average across 50,000 values for each weight. - - p. - Obviously we’re not going to store all those intermediate values. Instead, - we’ll track an accumulator for each weight, and divide it by the number of - iterations at the end. Again: we want the average weight assigned to a - feature/class pair during learning, so the key component we need is the total - weight it was assigned. But we also want to be careful about how we compute - that accumulator, too. On almost any instance, we’re going to see a tiny - fraction of active feature/class pairs. All the other feature/class weights - won’t change. So we shouldn’t have to go back and add the unchanged value - to our accumulators anyway, like chumps. - - p. - Since we’re not chumps, we’ll make the obvious improvement. We’ll maintain - another dictionary that tracks how long each weight has gone unchanged. Now - when we do change a weight, we can do a fast-forwarded update to the accumulator, - for all those iterations where it lay unchanged. - - p. - Here’s what a weight update looks like now that we have to maintain the - totals and the time-stamps: - - pre.language-python - code - | def update(self, truth, guess, features): - | def upd_feat(c, f, v): - | nr_iters_at_this_weight = self.i - self._timestamps[f][c] - | self._totals[f][c] += nr_iters_at_this_weight * self.weights[f][c] - | self.weights[f][c] += v - | self._timestamps[f][c] = self.i - - | self.i += 1 - | for f in features: - | upd_feat(truth, f, 1.0) - | upd_feat(guess, f, -1.0) - - h3 Features and pre-processing - - p. - The POS tagging literature has tonnes of intricate features sensitive to - case, punctuation, etc. They help on the standard test-set, which is from - Wall Street Journal articles from the 1980s, but I don’t see how they’ll - help us learn models that are useful on other text. - - p. - To help us learn a more general model, we’ll pre-process the data prior - to feature extraction, as follows: - - ul - li All words are lower cased; - li Digits in the range 1800-2100 are represented as !YEAR; - li Other digit strings are represented as !DIGITS - li - | It would be better to have a module recognising dates, phone numbers, - | emails, hash-tags, etc. 
but that will have to be pushed back into the - | tokenization. - - p. - I played around with the features a little, and this seems to be a reasonable - bang-for-buck configuration in terms of getting the development-data accuracy - to 97% (where it typically converges anyway), and having a smaller memory - foot-print: - - pre.language-python - code - | def _get_features(self, i, word, context, prev, prev2): - | '''Map tokens-in-contexts into a feature representation, implemented as a - | set. If the features change, a new model must be trained.''' - | def add(name, *args): - | features.add('+'.join((name,) + tuple(args))) - - | features = set() - | add('bias') # This acts sort of like a prior - | add('i suffix', word[-3:]) - | add('i pref1', word[0]) - | add('i-1 tag', prev) - | add('i-2 tag', prev2) - | add('i tag+i-2 tag', prev, prev2) - | add('i word', context[i]) - | add('i-1 tag+i word', prev, context[i]) - | add('i-1 word', context[i-1]) - | add('i-1 suffix', context[i-1][-3:]) - | add('i-2 word', context[i-2]) - | add('i+1 word', context[i+1]) - | add('i+1 suffix', context[i+1][-3:]) - | add('i+2 word', context[i+2]) - | return features - - p. - I haven’t added any features from external data, such as case frequency - statistics from the Google Web 1T corpus. I might add those later, but for - now I figured I’d keep things simple. - - h3 What about search? - - p. - The model I’ve recommended commits to its predictions on each word, and - moves on to the next one. Those predictions are then used as features for - the next word. There’s a potential problem here, but it turns out it doesn’t - matter much. It’s easy to fix with beam-search, but I say it’s not really - worth bothering. And it definitely doesn’t matter enough to adopt a slow - and complicated algorithm like Conditional Random Fields. - - p. - Here’s the problem. The best indicator for the tag at position, say, 3 in - a sentence is the word at position 3. But the next-best indicators are the - tags at positions 2 and 4. So there’s a chicken-and-egg problem: we want - the predictions for the surrounding words in hand before we commit to a - prediction for the current word. Here’s an example where search might matter: - - p.example. - Their management plan reforms worked - - p. - Depending on just what you’ve learned from your training data, you can - imagine making a different decision if you started at the left and moved - right, conditioning on your previous decisions, than if you’d started at - the right and moved left. - - p. - If that’s not obvious to you, think about it this way: “worked” is almost - surely a verb, so if you tag “reforms” with that in hand, you’ll have a - different idea of its tag than if you’d just come from “plan“, which you - might have regarded as either a noun or a verb. - - p. - Search can only help you when you make a mistake. It can prevent that error - from throwing off your subsequent decisions, or sometimes your future choices - will correct the mistake. And that’s why for POS tagging, search hardly matters! - Your model is so good straight-up that your past predictions are almost always - true. So you really need the planets to align for search to matter at all. - - p. - And as we improve our taggers, search will matter less and less. Instead - of search, what we should be caring about is multi-tagging. If we let the - model be a bit uncertain, we can get over 99% accuracy assigning an average - of 1.05 tags per word (Vadas et al, ACL 2006). 
The averaged perceptron is - rubbish at multi-tagging though. That’s its big weakness. You really want - a probability distribution for that. - - p. - One caveat when doing greedy search, though. It’s very important that your - training data model the fact that the history will be imperfect at run-time. - Otherwise, it will be way over-reliant on the tag-history features. Because - the Perceptron is iterative, this is very easy. - - p. - Here’s the training loop for the tagger: - - pre.language-python - code - | def train(self, sentences, save_loc=None, nr_iter=5, quiet=False): - | '''Train a model from sentences, and save it at save_loc. nr_iter - | controls the number of Perceptron training iterations.''' - | self._make_tagdict(sentences, quiet=quiet) - | self.model.classes = self.classes - | prev, prev2 = START - | for iter_ in range(nr_iter): - | c = 0; n = 0 - | for words, tags in sentences: - | context = START + [self._normalize(w) for w in words] + END - | for i, word in enumerate(words): - | guess = self.tagdict.get(word) - | if not guess: - | feats = self._get_features( - | i, word, context, prev, prev2) - | guess = self.model.predict(feats) - | self.model.update(tags[i], guess, feats) - | # Set the history features from the guesses, not the - | # true tags - | prev2 = prev; prev = guess - | c += guess == tags[i]; n += 1 - | random.shuffle(sentences) - | if not quiet: - | print("Iter %d: %d/%d=%.3f" % (iter_, c, n, _pc(c, n))) - | self.model.average_weights() - | # Pickle as a binary file - | if save_loc is not None: - | cPickle.dump((self.model.weights, self.tagdict, self.classes), - | open(save_loc, 'wb'), -1) - p. - Unlike the previous snippets, this one’s literal – I tended to edit the - previous ones to simplify. So if they have bugs, hopefully that’s why! - - p. - At the time of writing, I’m just finishing up the implementation before I - submit a pull request to TextBlob. You can see the rest of the source here: - - ul - li - a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/taggers.py") taggers.py - li - a(href="https://github.com/sloria/textblob-aptagger/blob/master/textblob_aptagger/_perceptron.py") _perceptron.py - - h3 A final comparison… - - p. - Over the years I’ve seen a lot of cynicism about the WSJ evaluation methodology. - The claim is that we’ve just been meticulously over-fitting our methods to this - data. Actually the evidence doesn’t really bear this out. Mostly, if a technique - is clearly better on one evaluation, it improves others as well. Still, it’s - very reasonable to want to know how these tools perform on other text. So I - ran the unchanged models over two other sections from the OntoNotes corpus: - - table - thead - tr - th Tagger - th WSJ - th ABC - th Web - tbody - tr - td Pattern - td 93.5 - td 90.7 - td 88.1 - tr - td NLTK - td 94.0 - td 91.5 - td 88.4 - tr - td PyGreedyAP - td 96.8 - td 94.8 - td 91.8 - - p. - The ABC section is broadcast news, Web is text from the web (blogs etc — I haven’t - looked at the data much). - - p. - As you can see, the order of the systems is stable across the three comparisons, - and the advantage of our Averaged Perceptron tagger over the other two is real - enough. Actually the pattern tagger does very poorly on out-of-domain text. - It mostly just looks up the words, so it’s very domain dependent. I hadn’t - realised it before, but it’s obvious enough now that I think about it. - - p. - We can improve our score greatly by training on some of the foreign data. 
- The technique described in this paper (Daume III, 2007) is the first thing - I try when I have to do that. - - - footer.meta(role='contentinfo') - a.button.button-twitter(href=urls.share_twitter, title='Share on Twitter', rel='nofollow') Share on Twitter - .discuss - a.button.button-hn(href='#', title='Discuss on Hacker News', rel='nofollow') Discuss on Hacker News - | - a.button.button-reddit(href='#', title='Discuss on Reddit', rel='nofollow') Discuss on Reddit diff --git a/docs/redesign/change_log.jade b/docs/redesign/change_log.jade deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/redesign/comparisons.jade b/docs/redesign/comparisons.jade deleted file mode 100644 index c4434db5c..000000000 --- a/docs/redesign/comparisons.jade +++ /dev/null @@ -1,139 +0,0 @@ -- var urls = {} -- urls.choi_paper = "http://aclweb.org/anthology/P/P15/P15-1038.pdf" -- urls.emnlp_paper = "honnibal_johnson_emnlp2015.pdf" - - -+comparison("NLTK") - p spaCy is: - ul - li.pro 100x faster; - li.pro 50% more accurate; - li.pro Serializes TODO% smaller; - - p spaCy features: - ul - li.pro Integrated word vectors; - li.pro Efficient binary serialization; - - p NLTK features: - ul - li.con Multiple languages; - li.neutral Educational resources - - -//+comparison("Pattern") -+comparison("CoreNLP") - p spaCy is: - - ul - li.pro TODO% faster; - li.pro TODO% more accurate; - li.pro Not Java; - li.pro Well documented; - li.pro Cheaper to license commercially; - li.neutral - | Opinionated/Minimalist. spaCy avoids providing redundant or overlapping - | options. - - p CoreNLP features: - - ul - li.con Multiple Languages; - li.con Sentiment analysis - li.con Coreference resolution - - -+comparison("ClearNLP") - p spaCy is: - - ul - li.pro Not Java; - li.pro TODO% faster; - li.pro Well documented; - li.neutral Slightly more accurate; - - p ClearNLP features: - - ul - li.con Semantic Role Labelling - li.con Multiple Languages - li.con Model for biology/life-science; - -//+comparison("Accuracy Summary") - -//+comparison("Speed Summary") -// table -// thead -// tr -// th. -// th(colspan=3) Absolute (ms per doc) -// th(colspan=3) Relative (to spaCy) -// -// tbody -// tr -// td: strong System -// td: strong Split -// td: strong Tag -// td: strong Parse -// td: strong Split -// td: strong Tag -// td: strong Parse -// -// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") -// +row("spaCy", "0.2ms", "1ms", "19ms", "1x", "1x", "1x") -// +row("CoreNLP", "2ms", "10ms", "49ms", "10x", "10x", "2.6x") -// +row("ZPar", "1ms", "8ms", "850ms", "5x", "8x", "44.7x") -// +row("NLTK", "4ms", "443ms", "n/a", "20x", "443x", "n/a") -// -// p -// | Set up: 100,000 plain-text documents were streamed -// | from an SQLite3 database, and processed with an NLP library, to one -// | of three levels of detail – tokenization, tagging, or parsing. -// | The tasks are additive: to parse the text you have to tokenize and -// | tag it. The pre-processing was not subtracted from the times – -// | I report the time required for the pipeline to complete. I report -// | mean times per document, in milliseconds. -// -// p -// | Hardware: Intel i7-3770 (2012) - - - - - -+comparison("Peer-reviewed Evaluations") - p. - spaCy is committed to rigorous evaluation under standard methodology. Two - papers in 2015 confirm that: - ol - li spaCy is the fastest syntactic parser in the world; - li Its accuracy is within 1% of the best available; - li The few systems that are more accurate are 20× slower or more. 
- - p - | spaCy v0.84 was evaluated by researchers at Yahoo! Labs and Emory University, - | as part of a survey paper benchmarking the current state-of-the-art dependency - | parsers - a(href=urls.choi_paper) (Choi et al., 2015) - | . - - table - thead - +columns("System", "Language", "Accuracy", "Speed") - - tbody - +row("spaCy v0.84", "Cython", "90.6", "13,963") - +row("spaCy v0.89", "Cython", "91.8", "13,000 (est.)") - +row("ClearNLP", "Java", "91.7", "10,271") - +row("CoreNLP", "Java", "89.6", "8,602") - +row("MATE", "Java", "92.5", "550") - +row("Turbo", "C++", "92.4", "349") - +row("Yara", "Java", "92.3", "340") - - p - | Discussion with the authors led to accuracy improvements in spaCy, which - | have been accepted for publication in EMNLP, in joint work with Macquarie - | University - a(href=urls.emnlp_paper) (Honnibal and Johnson, 2015) - | . - diff --git a/docs/redesign/docs.jade b/docs/redesign/docs.jade deleted file mode 100644 index e098bb0c0..000000000 --- a/docs/redesign/docs.jade +++ /dev/null @@ -1,129 +0,0 @@ -extends ./outline.jade - -include ./mixins.jade - - -mixin declare_class(name) - details - summary - span.declaration - span.label class - code #{name} - block - -mixin method(name, parameters) - details(open=attributes.open) - summary - span.declaration - span.label #{name} - span.parameters - | self, #{parameters} - block - - -mixin params - ul - block - - -mixin param(name, type, value) - li - if type - #{name} (!{type}) – - else - #{name} – - block - - -mixin attribute(name, type, value) - details(open=attributes.open) - summary - span.declaration - span.label #{name} - block - - -mixin returns(name, type, value) - li - if type - #{name} (!{type}) – - else - #{name} – - block - - -mixin returns(type) - | tmp - -mixin init - details - summary: h4 Init - - block - - -mixin callable - details - summary: h4 Callable - - block - - -mixin sequence - details - summary: h4 Sequence - - block - - -mixin maptype - details - summary: h4 Map - - block - - -mixin summary - block - -mixin en_example - pre.language-python - code - | from spacy.en import English - | from spacy._doc_examples import download_war_and_peace - | - | unprocessed_unicode = download_war_and_peace() - | - | nlp = English() - | doc = nlp(unprocessed_unicode) - - -block intro_block - section(class="intro") - - nav(role="navigation") - ul - li: a(href="#api" class="button") API - li: a(href="#tutorials" class="button") Tutorials - li: a(href="#spec" class="button") Spec - - -block body_block - - var py_docs = 'unicode', - 'bool': py_docs + 'functions.html#bool">bool', - 'int': py_docs + 'functions.html#int">int', - 'generator': "", - 'Vocab': "", - 'Span': "", - 'Doc': "" - } - - article - - +Section("API", "api", "api.jade") - +Section("Tutorials", "tutorials", "tutorials.jade") - +Section("Annotation Specifications", "spec", "spec.jade") diff --git a/docs/redesign/home.jade b/docs/redesign/home.jade deleted file mode 100644 index 66efd1455..000000000 --- a/docs/redesign/home.jade +++ /dev/null @@ -1,88 +0,0 @@ -extends ./outline.jade - -include ./mixins.jade - -// Notes -// -// 1. Where to put version notice? Should say something like -// 2015-08-12: v0.89 -// and be a link -// -// Only needs to appear on home page. 
- - -- var slogan = "Build Tomorrow's Language Technologies" -- var tag_line = "spaCy – " + slogan - -mixin lede - - var state_of_the_art = 'state-of-the-art' - - var a_minor_miracle = 'a minor miracle' - - var great_documentation = 'great documentation' - - var concise_API = 'concise API' - - p. - spaCy is a - library for industrial-strength natural language processing in Python and - Cython. It features !{state_of_the_art} speed and accuracy, a !{concise_API}, - and license terms designed to get out of your way. - If you're a small company doing NLP, we want spaCy to seem - like !{a_minor_miracle}. - - -mixin comparison(name) - details - summary - h4= name - - block - -mixin columns(...names) - tr - each name in names - th= name - - -mixin row(...cells) - tr - each cell in cells - td= cell - - -mixin social - footer(role="contentinfo") - a(href="http://twitter.com/share?text=[ARTICLE HEADLINE]&url=[ARTICLE LINK]&via=honnibal" title="Share on Twitter" rel="nofollow" class="button button-twitter") Share on Twitter - - div.discuss - a(href="#" title="Discuss on Hacker News" rel="nofollow" class="button button-hn") - | Discuss on Hacker News - - a(href="#" title="Discuss on Reddit" rel="nofollow" class="button button-reddit") - | Discuss on Reddit - - -block intro_block - section(class="intro") - +lede - - nav(role="navigation") - ul - li: a(href="#example-use" class="button") Examples - li: a(href="#comparisons" class="button") Comparisons - li: a(href="#online-demo" class="button") Try Online - li: a(href="#install" class="button") - | Install - v0.89 - - - -block body_block - article(class="page landing-page") - - +Section("Usage by Example", "example-use", "./usage_examples.jade") - - +Section("Comparisons and Benchmarks", "comparisons", "./comparisons.jade") - - +Section("Online Demo", "online-demo", "./online_demo.jade") - - - +Section("Install", "install", "./install.jade") diff --git a/docs/redesign/installation.jade b/docs/redesign/installation.jade deleted file mode 100644 index 50736e0ff..000000000 --- a/docs/redesign/installation.jade +++ /dev/null @@ -1,71 +0,0 @@ -mixin Option(name, open) - details(open=open) - summary - h4= name - block - -+Option("conda", true) - pre.language-bash: code - | $ conda install spacy - | $ python -m spacy.en.download - -+Option("pip and virtualenv", true) - p With Python 2.7 or Python 3, using Linux or OSX, run: - - pre.language-bash: code - | $ pip install spacy - | $ python -m spacy.en.download - - p - | The download command fetches and installs about 300mb of data, for - | the parser model and word vectors, which it installs within the spacy.en - | package directory. - - - +Option("Workaround for obsolete system Python", false) - p - | If you're stuck using a server with an old version of Python, and you - | don't have root access, I've prepared a bootstrap script to help you - | compile a local Python install. Run: - - pre.language-bash: code - | $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate - - - -+Option("Compile from source", false) - p - | The other way to install the package is to clone the github repository, - | and build it from source. This installs an additional dependency, - | Cython. If you're using Python 2, I also recommend installing fabric - | and fabtools – this is how I build the project. 
- - pre.language-bash: code - | $ git clone https://github.com/honnibal/spaCy.git - | $ cd spaCy - | $ virtualenv .env && source .env/bin/activate - | $ export PYTHONPATH=`pwd` - | $ pip install -r requirements.txt - | $ python setup.py build_ext --inplace - | $ python -m spacy.en.download - | $ pip install pytest - | $ py.test tests/ - - p - | Python packaging is awkward at the best of times, and it's particularly tricky - | with C extensions, built via Cython, requiring large data files. So, - | please report issues as you encounter them. - -+Option("pypy (Unsupported)") - | If PyPy support is a priority for you, please get in touch. We could likely - | fix the remaining issues, if necessary. However, the library is likely to - | be much slower on PyPy, as it's written in Cython, which produces code tuned - | for the performance of CPython. - -+Option("Windows (Unsupported)") - | Unfortunately we don't currently have access to a Windows machine, and have - | no experience developing on a MicroSoft stack. In theory the only problems are - | with the installation and packaging – there should be no deep platform - | dependency. Unfortunately we can't debug these issues at present, simply due - | to lack of a development environment. - diff --git a/docs/redesign/license.jade b/docs/redesign/license.jade deleted file mode 100644 index d8dc2135e..000000000 --- a/docs/redesign/license.jade +++ /dev/null @@ -1,179 +0,0 @@ -extends ./outline.jade - -mixin columns(...names) - tr - each name in names - th= name - - -mixin row(...cells) - tr - each cell in cells - td= cell - - -mixin LicenseOption(name, period, price, audience) - .item - h4 #{name} - - .focus #{period} - - span #{price} - - h5 Suggested for: - - span #{audience} - - a.button(href="spacy_trial_free.docx") Download license - - span or - a(href="#") get in touch - - -block body_block - article.pricing - - .box.license - +LicenseOption("Trial", "90 days", "$0", "Evaluation") - +LicenseOption("Production", "1 year", "$5,000", "Production") - +LicenseOption("Certainty", "5 years", "$20,000", "Secure Planning") - - p.caption - | Researcher, hobbyist, or open-source developer? spaCy also offers - a(href="http://www.gnu.org/licenses/agpl-3.0.en.html") AGPLv3 - | licenses. - - p. - What we offer is a rare, simple certainty: a long-term, permissive license - that comes with full access to the source, complete transparency, and almost - complete flexibility. The difference between this and a black-box API is - night and day. You cannot build a great product against a service you - don't understand, and you can't build a great business on a service you - don't control. - - p - | Let's face it: services disappear. Constantly. The good start-ups get - | bought; the bad ones go bankrupt. Open-source projects become abandoned - | or bloated. Google's graveyard is over-flowing – ditto for Yahoo!, - | Microsoft, etc. Sure, IBM won't be broke...But will BlueMix be sunset? - - p - | A 5 year license won't expire until 2020. spaCy will be with you for - | longer than most of your current staff. If that's still not enough, - | get in touch. I'm sure we can work something out. - - //p. - // To make spaCy as valuable as possible, licenses to it are for life. You get - // complete transparency, certainty and control. If you need to use spaCy - // as an API, it's trivial to host it yourself – and you don't need to - // worry about the service changing or disappearing. And if you're ever in - // acquisition or IPO talks, the story is simple. - - //p. 
- // spaCy can also be used as free open-source software, under the Aferro GPL - // license. If you use it this way, you must comply with the AGPL license - // terms. When you distribute your project, or offer it as a network service, - // you must distribute the source-code and grant users an AGPL license to it. - - - //h3 Examples - - //p. - // In order to clarify how spaCy's license structure might apply to you, I've - // written a few examples, in the form of user-stories. - - //details - // summary: h4 Seed stage start-ups - - // p. - // Ashley and Casey have an idea for a start-up. To explore their idea, they - // want to build a minimum viable product they can put in front of potential - // users and investors. - - // p. They have two options. - - // ol - // li - // p. - // Trial commercial license. With a simple form, they can - // use spaCy for 90 days, for a nominal fee of $1. They are free to modify - // spaCy, and they will own the copyright to their modifications for the - // duration of the license. After the trial period elapses, they can either - // pay the license fee, stop using spaCy, release their project under the - // AGPL. - // - // li - // p. - // AGPL. Casey and Pat can instead use spaCy under the AGPL - // license. However, they must then release any code that statically or - // dynamically links to spaCy under the AGPL as well (e.g. if they import - // the module, or import a module that imports it, etc). They also cannot - // use spaCy as a network resource, by running it as a service --- this is - // the loophole that the "A" part of the AGPL is designed to close. - // - // p. - // Ashley and Casey find the AGPL license unattractive for commercial use. - // They decide to take up the trial commercial license. However, over the - // next 90 days, Ashley has to move house twice, and Casey gets sick. By - // the time the trial expires, they still don't have a demo they can show - // investors. They send an email explaining the situation, and a 90 day extension - // to their trial license is granted. - - // p. - // By the time the extension period has elapsed, spaCy has helped them secure - // funding, and they even have a little revenue. They are glad to pay the - // $5,000 commercial license fee. - - // p. - // spaCy is now permanently licensed for the product Ashley and Casey are - // developing. They own the copyright to any modifications they make to spaCy, - // but not to the original spaCy code. - - // p. - // No additional fees will be due when they hire new developers, run spaCy on - // additional internal servers, etc. If their company is acquired, the license - // will be transferred to the company acquiring them. However, to use spaCy - // in another product, they will have to buy a second license. - - - // details - // summary: h4 University academics - - // p. - // Alex and Sasha are post-doctoral researchers working for a university. - // Part of their funding comes from a grant from Google, but Google will not - // own any part of the work that they produce. Their mission is just to write - // papers. - - // p. - // Alex and Sasha find spaCy convenient, so they use it in their system under - // the AGPL. This means that their system must also be released under the - // AGPL, but they're cool with that – they were going to release their - // code anyway, as it's the only way to ensure their experiments are properly - // repeatable. - - // p. - // Alex and Sasha find and fix a few bugs in spaCy. 
They must release these - // modifications, and they ask that they be accepted into the main spaCy repo. - // In order to do this, they must sign a contributor agreement, ceding their - // copyright. When commercial licenses to spaCy are sold, Alex and Sasha will - // not be able to claim any royalties from their contributions. - - // p. - // Later, Alex and Sasha implement new features into spaCy, for another paper. - // The code was quite rushed, and they don't want to take the time to put - // together a proper pull request. They must release their modifications - // under the AGPL, but they are not obliged to contribute it to the spaCy - // repository, or concede their copyright. - - // details - // summary: h4 Open Source developers - - // p. - // Phuong and Jessie use the open-source software Calibre to manage their - // e-book libraries. They have an idea for a search feature, and they want - // to use spaCy to implement it. Calibre is released under the GPLv3. The - // AGPL has additional restrictions for projects used as a network resource, - // but they don't apply to this project, so Phuong and Jessie can use spaCy - // to improve Calibre. They'll have to release their code, but that was - // always their intention anyway. diff --git a/docs/redesign/mixins.jade b/docs/redesign/mixins.jade deleted file mode 100644 index 005149a2b..000000000 --- a/docs/redesign/mixins.jade +++ /dev/null @@ -1,17 +0,0 @@ -mixin Section(title_text, link_name, include_file) - h3: a(name=link_name) #{title_text} - - if (link_name == "example-use") - include ./usage_examples.jade - else if (link_name == "online-demo") - include ./online_demo.jade - else if (link_name == "comparisons") - include ./comparisons.jade - else if (link_name == "install") - include ./installation.jade - else if (link_name == "api") - include ./api.jade - else if (link_name == "tutorials") - include ./tutorials.jade - else if (link_name == "spec") - include ./spec.jade diff --git a/docs/redesign/online_demo.jade b/docs/redesign/online_demo.jade deleted file mode 100644 index 92a61eefc..000000000 --- a/docs/redesign/online_demo.jade +++ /dev/null @@ -1,18 +0,0 @@ -mixin Displacy(sentence, caption_text, height) - - var url = "http://ines.io/displacy/?full=" + sentence.replace(" ", "%20") - - .displacy - iframe.displacy(src="displacy/displacy_demo.html" height=height) - - a.view-displacy(href=url) - | Interactive Visualizer - - p.caption. 
- #{caption_text} - - -+Displacy( - "Click the button to see this sentence in displaCy.", - "The best parse-tree visualizer and annotation tool in all the land.", - 275 -) diff --git a/docs/redesign/outline.jade b/docs/redesign/outline.jade deleted file mode 100644 index 1ae9eacfa..000000000 --- a/docs/redesign/outline.jade +++ /dev/null @@ -1,37 +0,0 @@ -- var slogan = "Build Tomorrow's Language Technologies" -- var tag_line = "spaCy – " + slogan - - -doctype html -html(lang="en") - head - meta(charset="utf-8") - title!= tag_line - meta(name="description" content="") - meta(name="author" content="Matthew Honnibal") - link(rel="stylesheet" href="css/style.css") - - - body(id="home" role="document") - header(role="banner") - h1(class="logo")!= tag_line - div(class="slogan")!= slogan - - nav(role="navigation") - ul - li: a(href="home.html") Home - li: a(href="docs.html") Docs - li: a(href="license.html") License - li: a(href="blog.html") Blog - - main(id="content" role="main") - block intro_block - - block body_block - - footer(role="contentinfo") - - script(src="js/prism.js") - script(src="js/details_polyfill.js") diff --git a/docs/redesign/spec.jade b/docs/redesign/spec.jade deleted file mode 100644 index 4c459f409..000000000 --- a/docs/redesign/spec.jade +++ /dev/null @@ -1,129 +0,0 @@ -mixin columns(...names) - tr - each name in names - th= name - - -mixin row(...cells) - tr - each cell in cells - td= cell - - -details - summary: h4 Overview - - p. - This document describes the target annotations spaCy is trained to predict. - This is currently a work in progress. Please ask questions on the issue tracker, - so that the answers can be integrated here to improve the documentation. - -details - summary: h4 Tokenization - - p Tokenization standards are based on the OntoNotes 5 corpus. - - p. - The tokenizer differs from most by including tokens for significant - whitespace. Any sequence of whitespace characters beyond a single space - (' ') is included as a token. For instance: - - pre.language-python - code - | from spacy.en import English - | nlp = English(parse=False) - | tokens = nlp('Some\nspaces and\ttab characters') - | print([t.orth_ for t in tokens]) - - p Which produces: - - pre.language-python - code - | ['Some', '\n', 'spaces', ' ', 'and', '\t', 'tab', 'characters'] - - p. - The whitespace tokens are useful for much the same reason punctuation is - – it's often an important delimiter in the text. By preserving - it in the token output, we are able to maintain a simple alignment - between the tokens and the original string, and we ensure that no - information is lost during processing. - -details - summary: h4 Sentence boundary detection - - p. - Sentence boundaries are calculated from the syntactic parse tree, so - features such as punctuation and capitalisation play an important but - non-decisive role in determining the sentence boundaries. Usually this - means that the sentence boundaries will at least coincide with clause - boundaries, even given poorly punctuated text. - -details - summary: h4 Part-of-speech Tagging - - p. - The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank - tag set. We also map the tags to the simpler Google Universal POS Tag set. - - p. - Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 - -details - summary: h4 Lemmatization - - p. - A "lemma" is the uninflected form of a word. 
In English, this means: - - ul - li Adjectives: The form like "happy", not "happier" or "happiest" - li Adverbs: The form like "badly", not "worse" or "worst" - li Nouns: The form like "dog", not "dogs"; like "child", not "children" - li Verbs: The form like "write", not "writes", "writing", "wrote" or "written" - - p. - The lemmatization data is taken from WordNet. However, we also add a - special case for pronouns: all pronouns are lemmatized to the special - token -PRON-. - - -details - summary: h4 Syntactic Dependency Parsing - - p. - The parser is trained on data produced by the ClearNLP converter. Details - of the annotation scheme can be found here: http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf - -details - summary: h4 Named Entity Recognition - - table - thead - +columns("Entity Type", "Description") - - tbody - +row("PERSON", "People, including fictional.") - +row("NORP", "Nationalities or religious or political groups.") - +row("FACILITY", "Buildings, airports, highways, bridges, etc.") - +row("ORG", "Companies, agencies, institutions, etc.") - +row("GPE", "Countries, cities, states.") - +row("LOC", "Non-GPE locations, mountain ranges, bodies of water.") - +row("PRODUCT", "Vehicles, weapons, foods, etc. (Not services") - +row("EVENT", "Named hurricanes, battles, wars, sports events, etc.") - +row("WORK_OF_ART", "Titles of books, songs, etc.") - +row("LAW", "Named documents made into laws") - +row("LANGUAGE", "Any named language") - - p The following values are also annotated in a style similar to names: - - table - thead - +columns("Entity Type", "Description") - - tbody - +row("DATE", "Absolute or relative dates or periods") - +row("TIME", "Times smaller than a day") - +row("PERCENT", 'Percentage (including “%”)') - +row("MONEY", "Monetary values, including unit") - +row("QUANTITY", "Measurements, as of weight or distance") - +row("ORDINAL", 'first", "second"') - +row("CARDINAL", "Numerals that do not fall under another type") diff --git a/docs/redesign/template_post.jade b/docs/redesign/template_post.jade deleted file mode 100644 index 0012d24b7..000000000 --- a/docs/redesign/template_post.jade +++ /dev/null @@ -1,31 +0,0 @@ -doctype html -html(lang='en') - head - meta(charset='utf-8') - title spaCy Blog - meta(name='description', content='') - meta(name='author', content='Matthew Honnibal') - link(rel='stylesheet', href='css/style.css') - //if lt IE 9 - script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') - body#blog(role="document") - header(role='banner') - h1.logo spaCy Blog - .slogan Blog - - nav(role="navigation") - ul - li: a(href="home.html") Home - li: a(href="docs.html") Docs - li.active: a(href="blog.html") Blog - li: a(href="license.html") License - - main#content(role='main') - block intro_block - - block body_block - - footer(role='contentinfo') - - script(src="js/prism.js") - script(src="js/details_polyfill.js") diff --git a/docs/redesign/tute_adverbs.jade b/docs/redesign/tute_adverbs.jade deleted file mode 100644 index c81931b0e..000000000 --- a/docs/redesign/tute_adverbs.jade +++ /dev/null @@ -1,200 +0,0 @@ -doctype html -html(lang='en') - head - meta(charset='utf-8') - title spaCy Blog - meta(name='description', content='') - meta(name='author', content='Matthew Honnibal') - link(rel='stylesheet', href='css/style.css') - //if lt IE 9 - script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') - body#blog - header(role='banner') - h1.logo spaCy Blog - .slogan Blog - main#content(role='main') - article.post - - - :markdown-it 
- # Adverbs - - Let's say you're developing a proofreading tool, or possibly an IDE for - writers. You're convinced by Stephen King's advice that `adverbs are - not your friend `_, - so you want to **highlight all adverbs**. We'll use one of the examples - he finds particularly egregious: - - pre.language-python - code - | import spacy.en - | >>> from spacy.parts_of_speech import ADV - | >>> # Load the pipeline, and call it with some text. - | >>> nlp = spacy.en.English() - | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False) - | >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) - | u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ - - :markdown-it - Easy enough --- but the problem is that we've also highlighted "back". - While "back" is undoubtedly an adverb, we probably don't want to highlight - it. If what we're trying to do is flag dubious stylistic choices, we'll - need to refine our logic. It turns out only a certain type of adverb - is of interest to us. - - - :markdown-it - There are lots of ways we might do this, depending on just what words - we want to flag. The simplest way to exclude adverbs like "back" and - "not" is by word frequency: these words are much more common than the - prototypical manner adverbs that the style guides are worried about. - - :markdown-it - The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attribute gives a - log probability estimate of the word: - - pre.language-python - code - | >>> nlp.vocab[u'back'].prob - | -7.403977394104004 - | >>> nlp.vocab[u'not'].prob - | -5.407193660736084 - | >>> nlp.vocab[u'quietly'].prob - | -11.07155704498291 - - :markdown-it - (The probability estimate is based on counts from a 3 billion word corpus, - smoothed using the `Simple Good-Turing`_ method.) - - So we can easily exclude the N most frequent words in English from our - adverb marker. Let's try N=1000 for now: - - pre.language-python - code - | >>> import spacy.en - | >>> from spacy.parts_of_speech import ADV - | >>> nlp = spacy.en.English() - | >>> # Find log probability of Nth most frequent word - | >>> probs = [lex.prob for lex in nlp.vocab] - | >>> probs.sort() - | >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - | >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - | >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - | ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’ - - :markdown-it - There are lots of other ways we could refine the logic, depending on - just what words we want to flag. Let's say we wanted to only flag - adverbs that modified words similar to "pleaded". This is easy to do, - as spaCy loads a vector-space representation for every word (by default, - the vectors produced by `Levy and Goldberg (2014)`_). Naturally, the - vector is provided as a numpy array: - - pre.language-python - code - | >>> pleaded = tokens[7] - | >>> pleaded.repvec.shape - | (300,) - | >>> pleaded.repvec[:5] - | array([ 0.04229792, 0.07459262, 0.00820188, -0.02181299, 0.07519238], dtype=float32) - - :markdown-it - We want to sort the words in our vocabulary by their similarity to - "pleaded". There are lots of ways to measure the similarity of two - vectors. 
We'll use the cosine metric: - - pre.language-python - code - | >>> from numpy import dot - | >>> from numpy.linalg import norm - - | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) - | >>> words = [w for w in nlp.vocab if w.has_repvec] - | >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) - | >>> words.reverse() - | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) - | 1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading - | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) - | 50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses - | >>> print('100-110', ', '.join(w.orth_ for w in words[100:110])) - | 100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes - | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) - | 1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged - | >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010])) - | 50000-50010, fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists - - :markdown-it - As you can see, the similarity model that these vectors give us is excellent - --- we're still getting meaningful results at 1000 words, off a single - prototype! The only problem is that the list really contains two clusters of - words: one associated with the legal meaning of "pleaded", and one for the more - general sense. Sorting out these clusters is an area of active research. - - A simple work-around is to average the vectors of several words, and use that - as our target: - - pre.language-python - code - | >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested'] - | >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs) - | >>> words.sort(key=lambda w: cosine(w.repvec * say_vector)) - | >>> words.reverse() - | >>> print('1-20', ', '.join(w.orth_ for w in words[0:20])) - | 1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired - | >>> print('50-60', ', '.join(w.orth_ for w in words[50:60])) - | 50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed - | >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010])) - | 1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate - - :markdown-it - These definitely look like words that King might scold a writer for attaching - adverbs to. Recall that our original adverb highlighting function looked like - this: - - pre.language-python - code - | >>> import spacy.en - | >>> from spacy.parts_of_speech import ADV - | >>> # Load the pipeline, and call it with some text. - | >>> nlp = spacy.en.English() - | >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", - | tag=True, parse=False) - | >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens)) - | ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ - - - :markdown-it - We wanted to refine the logic so that only adverbs modifying evocative - verbs of communication, like "pleaded", were highlighted. 
We've now - built a vector that represents that type of word, so now we can highlight - adverbs based on subtle logic, honing in on adverbs that seem the most - stylistically problematic, given our starting assumptions: - - pre.language-python - code - | >>> import numpy - | >>> from numpy import dot - | >>> from numpy.linalg import norm - | >>> import spacy.en - | >>> from spacy.parts_of_speech import ADV, VERB - | >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) - | >>> def is_bad_adverb(token, target_verb, tol): - | ... if token.pos != ADV - | ... return False - | ... elif token.head.pos != VERB: - | ... return False - | ... elif cosine(token.head.repvec, target_verb) < tol: - | ... return False - | ... else: - | ... return True - - :markdown-it - This example was somewhat contrived --- and, truth be told, I've never - really bought the idea that adverbs were a grave stylistic sin. But - hopefully it got the message across: the state-of-the-art NLP technologies - are very powerful. spaCy gives you easy and efficient access to them, - which lets you build all sorts of useful products and features that - were previously impossible. - - footer(role='contentinfo') - script(src='js/prism.js') diff --git a/docs/redesign/tute_syntax_search.jade b/docs/redesign/tute_syntax_search.jade deleted file mode 100644 index c3679b83d..000000000 --- a/docs/redesign/tute_syntax_search.jade +++ /dev/null @@ -1,132 +0,0 @@ -doctype html -html(lang='en') - head - meta(charset='utf-8') - title spaCy Blog - meta(name='description', content='') - meta(name='author', content='Matthew Honnibal') - link(rel='stylesheet', href='css/style.css') - //if lt IE 9 - script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') - body#blog - header(role='banner') - h1.logo spaCy Blog - .slogan Blog - main#content(role='main') - section.intro - p - | Example use of the spaCy NLP tools for data exploration. - | Here we will look for reddit comments that describe Google doing something, - | i.e. discuss the company's actions. This is difficult, because other senses of - | "Google" now dominate usage of the word in conversation, particularly references to - | using Google products. - - p - | The heuristics used are quick and dirty – about 5 minutes work. - - //| A better approach is to use the word vector of the verb. But, the - // | demo here is just to show what's possible to build up quickly, to - // | start to understand some data. - - article.post - header - h2 Syntax-specific Search - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2015-08-14') August - - details - summary: h4 Imports - - pre.language-python - code - | from __future__ import unicode_literals - | from __future__ import print_function - | import sys - | - | import plac - | import bz2 - | import ujson - | import spacy.en - - details - summary: h4 Load the model and iterate over the data - - pre.language-python - code - | def main(input_loc): - | nlp = spacy.en.English() # Load the model takes 10-20 seconds. - | for line in bz2.BZ2File(input_loc): # Iterate over the reddit comments from the dump. - | comment_str = ujson.loads(line)['body'] # Parse the json object, and extract the 'body' attribute. 
- | - details - summary: h4 Apply the spaCy NLP pipeline, and look for the cases we want - - pre.language-python - code - | comment_parse = nlp(comment_str) - | for word in comment_parse: - | if google_doing_something(word): - | # Print the clause - | print(''.join(w.string for w in word.head.subtree).strip()) - details - summary: h4 Define the filter function - - pre.language-python - code - - | - | def google_doing_something(w): - | if w.lower_ != 'google': - | return False - | # Is it the subject of a verb? - | elif w.dep_ != 'nsubj': - | return False - | # And not 'is' - | elif w.head.lemma_ == 'be' and w.head.dep_ != 'aux': - | return False - | # Exclude e.g. "Google says..." - | elif w.head.lemma_ in ('say', 'show'): - | return False - | else: - | return True - | - | - - details - summary: h4 Call main - - pre.language-python - code - | if __name__ == '__main__': - | plac.call(main) - - details - summary: h4 Example output - - p. - Many false positives remain. Some are from incorrect interpretations - of the sentence by spaCy, some are flaws in our filtering logic. But - the results are vastly better than a string-based search, which returns - almost no examples of the pattern we're looking for. - - code - | Google dropped support for Android < 4.0 already - | google drive - | Google to enforce a little more uniformity in its hardware so that we can see a better 3rd party market for things like mounts, cases, etc - | When Google responds - | Google translate cyka pasterino. - | A quick google looks like Synology does have a sync'ing feature which does support block level so that should work - | (google came up with some weird One Piece/FairyTail crossover stuff), and is their knowledge universally infallible? - | Until you have the gear, google some videos on best farming runs on each planet, you can get a lot REAL fast with the right loop. - | Google offers something like this already, but it is truly terrible. - | google isn't helping me - | Google tells me: 0 results, 250 pages removed from google. 
- | how did Google swoop in and eat our lunch - - - - script(src="js/prism.js") - script(src="js/details_polyfill.js") diff --git a/docs/redesign/tute_twitter.jade b/docs/redesign/tute_twitter.jade deleted file mode 100644 index f8f849eed..000000000 --- a/docs/redesign/tute_twitter.jade +++ /dev/null @@ -1,204 +0,0 @@ -doctype html -html(lang='en') - head - meta(charset='utf-8') - title spaCy Blog - meta(name='description', content='') - meta(name='author', content='Matthew Honnibal') - link(rel='stylesheet', href='css/style.css') - //if lt IE 9 - script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js') - body#blog - header(role='banner') - h1.logo spaCy Blog - .slogan Blog - main#content(role='main') - article.post - header - h2 Finding Relevant Tweets - .subhead - | by - a(href='#', rel='author') Matthew Honnibal - | on - time(datetime='2015-08-14') December - - details - summary: h4 Imports - pre.language-python - - | from __future__ import unicode_literals, print_function - | import plac - | import codecs - | import sys - | import math - | - | import spacy.en - | from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ - | - | from termcolor import colored - | from twython import TwythonStreamer - | - | from os import path - | from math import sqrt - | - | from numpy import dot - | from numpy.linalg import norm - | - | - - details - summary: h4 Simple vector-averaging similarity - - pre.language-python: code - - | class Meaning(object): - | def __init__(self, vectors): - | if vectors: - | self.vector = sum(vectors) / len(vectors) - | self.norm = norm(self.vector) - | else: - | self.vector = None - | self.norm = 0 - | - | @classmethod - | def from_path(cls, nlp, loc): - | with codecs.open(loc, 'r', 'utf8') as file_: - | terms = file_.read().strip().split() - | return cls.from_terms(nlp, terms) - | - | @classmethod - | def from_tokens(cls, nlp, tokens): - | vectors = [t.repvec for t in tokens] - | return cls(vectors) - | - | @classmethod - | def from_terms(cls, nlp, examples): - | lexemes = [nlp.vocab[eg] for eg in examples] - | vectors = [eg.repvec for eg in lexemes] - | return cls(vectors) - | - | def similarity(self, other): - | if not self.norm or not other.norm: - | return -1 - | return dot(self.vector, other.vector) / (self.norm * other.norm) - | - - details - summary: h4 Print matches - - pre.language-python: code - - | - | def print_colored(model, stream=sys.stdout): - | if model['is_match']: - | color = 'green' - | elif model['is_reject']: - | color = 'red' - | else: - | color = 'grey' - | - | if not model['is_rare'] and model['is_match'] and not model['is_reject']: - | match_score = colored('%.3f' % model['match_score'], 'green') - | reject_score = colored('%.3f' % model['reject_score'], 'red') - | prob = '%.5f' % model['prob'] - | - | print(match_score, reject_score, prob) - | print(repr(model['text']), color) - | print('') - | - | - - details - summary: h4 TextMatcher: Process the tweets using spaCy - - pre.language-python: code - - | class TextMatcher(object): - | def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject): - | self.nlp = nlp - | self.get_target = get_target - | self.get_reject = get_reject - | self.min_prob = min_prob - | self.min_match = min_match - | self.max_reject = max_reject - | - | def __call__(self, text): - | tweet = self.nlp(text) - | target_terms = self.get_target() - | reject_terms = self.get_reject() - | - | prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) - | meaning = Meaning.from_tokens(self, tweet) - | - | 
match_score = meaning.similarity(self.get_target()) - | reject_score = meaning.similarity(self.get_reject()) - | return { - | 'text': tweet.string, - | 'prob': prob, - | 'match_score': match_score, - | 'reject_score': reject_score, - | 'is_rare': prob < self.min_prob, - | 'is_match': prob >= self.min_prob and match_score >= self.min_match, - | 'is_reject': prob >= self.min_prob and reject_score >= self.max_reject - | } - | - | - - details - summary: h4 Connect to Twitter and stream tweets - - pre.language-python: code - - | class Connection(TwythonStreamer): - | def __init__(self, keys_dir, handler, view): - | keys = Secrets(keys_dir) - | TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret) - | self.handler = handler - | self.view = view - | - | def on_success(self, data): - | text = data.get('text', u'') - | # Twython returns either bytes or unicode, depending on tweet. - | # #APIshaming - | try: - | model = self.handler(text) - | except TypeError: - | model = self.handler(text.decode('utf8')) - | status = self.view(model, sys.stdin) - | - | def on_error(self, status_code, data): - | print(status_code) - | - | - | class Secrets(object): - | def __init__(self, key_dir): - | self.key = open(path.join(key_dir, 'key.txt')).read().strip() - | self.secret = open(path.join(key_dir, 'secret.txt')).read().strip() - | self.token = open(path.join(key_dir, 'token.txt')).read().strip() - | self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip() - | - | - - details - summary: h4 Command-line interface - - pre.language-python: code - - | def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5): - | # We don't need the parser for this demo, so may as well save the loading time - | nlp = spacy.en.English(Parser=None) - | get_target = lambda: Meaning.from_path(nlp, target_loc) - | get_reject = lambda: Meaning.from_path(nlp, reject_loc) - | matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject) - | - | twitter = Connection(keys_dir, matcher, print_colored) - | twitter.statuses.filter(track=term) - | - | - | if __name__ == '__main__': - | plac.call(main) - | - - footer(role='contentinfo') - script(src='js/prism.js') - diff --git a/docs/redesign/tutorials.jade b/docs/redesign/tutorials.jade deleted file mode 100644 index ad1a4dbc9..000000000 --- a/docs/redesign/tutorials.jade +++ /dev/null @@ -1,29 +0,0 @@ -mixin Tutorial(title) - details - summary - h4= title - - block - -+Tutorial("Mark-up all manner adverbs, especially for verbs of speech") - | Let's say you're developing a proofreading tool, or possibly an IDE for - | writers. You're convinced by Stephen King's advice that - | adverbs are not your friend - | so you want to - a.readmore(href='tute_adverbs.html') - | highlight all adverbs. ► - -+Tutorial("Search Reddit for comments about Google doing something") - | Example use of the spaCy NLP tools for data exploration. - | Here we will look for Reddit comments that describe Google doing something, - | i.e. discuss the company's actions. This is difficult, because other - | senses of "Google" now dominate usage of the word in conversation, - | particularly references to using Google products. - a.readmore(href='tute_adverbs.html') - | ► - -+Tutorial("Use word vectors for semantic search of Twitter") - | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. 
- | Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore. - a.readmore(href='tute_twitter.html') - | ► diff --git a/docs/redesign/usage_examples.jade b/docs/redesign/usage_examples.jade deleted file mode 100644 index 04f29eeb9..000000000 --- a/docs/redesign/usage_examples.jade +++ /dev/null @@ -1,167 +0,0 @@ -mixin example(name) - details - summary - h4= name - block - - -+example("Load resources and process text") - pre.language-python: code - | from __future__ import unicode_literals, print_function - | from spacy.en import English - | nlp = English() - | doc = nlp('Hello, world. Here are two sentences.') - -+example("Get tokens and sentences") - pre.language-python: code - | token = doc[0] - | sentence = doc.sents[0] - | assert token[0] is sentence[0] - -+example("Use integer IDs for any string") - pre.language-python: code - | hello_id = nlp.vocab.strings['Hello'] - | hello_str = nlp.vocab.strings[hello_id] - | - | assert token.orth == hello_id == 52 - | assert token.orth_ == hello_str == 'Hello' - -+example("Get and set string views and flags") - pre.language-python: code - | assert token.shape_ == 'Xxxx' - | for lexeme in nlp.vocab: - | if lexeme.is_alpha: - | lexeme.shape_ = 'W' - | elif lexeme.is_digit: - | lexeme.shape_ = 'D' - | elif lexeme.is_punct: - | lexeme.shape_ = 'P' - | else: - | lexeme.shape_ = 'M' - | assert token.shape_ == 'W' - -+example("Export to numpy arrays") - pre.language-python: code - | from spacy.en.attrs import ORTH, LIKE_URL, IS_OOV - | - | attr_ids = [ORTH, LIKE_URL, IS_OOV] - | doc_array = doc.to_array(attr_ids) - | assert doc_array.shape == (len(doc), len(attrs) - | assert doc[0].orth == doc_array[0, 0] - | assert doc[1].orth == doc_array[1, 0] - | assert doc[0].like_url == doc_array[0, 1] - | assert doc_array[, 1] == [t.like_url for t in doc] - -+example("Word vectors") - pre.language-python: code - | doc = nlp("Apples and oranges are similar. Boots and hippos aren't.") - | - | apples = doc[0] - | oranges = doc[1] - | boots = doc[6] - | hippos = doc[8] - | - | assert apples.similarity(oranges) > boots.similarity(hippos) - - -+example("Part-of-speech tags") - pre.language-python: code - | from spacy.parts_of_speech import ADV - | - | def is_adverb(token): - | return token.pos == spacy.parts_of_speech.ADV - | - | # These are data-specific, so no constants are provided. You have to look - | # up the IDs from the StringStore. 
- | NNS = nlp.vocab.strings['NNS'] - | NNPS = nlp.vocab.strings['NNPS'] - | def is_plural_noun(token): - | return token.tag == NNS or token.tag == NNPS - | - | def print_coarse_pos(token): - | print(token.pos_) - | - | def print_fine_pos(token): - | print(token.tag_) - -+example("Syntactic dependencies") - pre.language-python: code - | def dependency_labels_to_root(token): - | '''Walk up the syntactic tree, collecting the arc labels.''' - | dep_labels = [] - | while token.root is not token: - | dep_labels.append(token.dep) - | token = token.head - | return dep_labels - -+example("Named entities") - pre.language-python: code - | def iter_products(docs): - | for doc in docs: - | for ent in doc.ents: - | if ent.label_ == 'PRODUCT': - | yield ent - | - | def word_is_in_entity(word): - | return word.ent_type != 0 - | - | def count_parent_verb_by_person(docs): - | counts = defaultdict(defaultdict(int)) - | for doc in docs: - | for ent in doc.ents: - | if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: - | counts[ent.orth_][ent.root.head.lemma_] += 1 - | return counts - - //+example("Define custom NER rules") - // pre.language-python: code - // | nlp.matcher - - -+example("Calculate inline mark-up on original string") - pre.language-python: code - | def put_spans_around_tokens(doc, get_classes): - | '''Given some function to compute class names, put each token in a - | span element, with the appropriate classes computed. - | - | All whitespace is preserved, outside of the spans. (Yes, I know HTML - | won't display it. But the point is no information is lost, so you can - | calculate what you need, e.g.
<br /> tags, <p>
tags, etc.) - | ''' - | output = [] - | template = '<span classes="{classes}">{word}</span>{space}' - | for token in doc: - | if token.is_space: - | output.append(token.orth_) - | else: - | output.append( - | template.format( - | classes=' '.join(get_classes(token)), - | word=token.orth_, - | space=token.whitespace_)) - | string = ''.join(output) - | string = string.replace('\n', '
') - | string = string.replace('\t', '    ' - | return string - - -+example("Efficient binary serialization") - pre.language-python: code - | - | byte_string = doc.as_bytes() - | open('/tmp/moby_dick.bin', 'wb').write(byte_string) - | - | nlp = spacy.en.English() - | for byte_string in Doc.read(open('/tmp/moby_dick.bin', 'rb')): - | doc = Doc(nlp.vocab) - | doc.from_bytes(byte_string) - - -p - | See the - a(href="docs.html") docs page - | for - a(href="docs.html#api") API documentation, - a(href="docs.html#tutorials") tutorials, - | and - a(href="docs.html#spec") annotation specs. From dcc8fadc7e7dffd86e2616dea52f9d718c1795ed Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:10:43 +0200 Subject: [PATCH 074/138] * Add gazetteer-matcher --- tests/parser/test_base_nps.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 tests/parser/test_base_nps.py diff --git a/tests/parser/test_base_nps.py b/tests/parser/test_base_nps.py new file mode 100644 index 000000000..f37c80f07 --- /dev/null +++ b/tests/parser/test_base_nps.py @@ -0,0 +1,40 @@ +import pytest + + +@pytest.mark.models +def test_nsubj(EN): + sent = EN(u'A base phrase should be recognized.') + base_nps = list(sent.noun_chunks) + assert len(base_nps) == 1 + assert base_nps[0].string == 'A base phrase ' + + +@pytest.mark.models +def test_coord(EN): + sent = EN(u'A base phrase and a good phrase are often the same.') + base_nps = list(sent.noun_chunks) + assert len(base_nps) == 2 + assert base_nps[0].string == 'A base phrase ' + assert base_nps[1].string == 'a good phrase ' + + +@pytest.mark.models +def test_pp(EN): + sent = EN(u'A phrase with another phrase occurs') + base_nps = list(sent.noun_chunks) + assert len(base_nps) == 2 + assert base_nps[0].string == 'A phrase ' + assert base_nps[1].string == 'another phrase ' + + +@pytest.mark.models +def test_merge_pp(EN): + sent = EN(u'A phrase with another phrase occurs') + nps = [(np[0].idx, np[-1].idx + len(np[-1]), np[0].ent_type_) for np in sent.noun_chunks] + + for start, end, ent_type in nps: + sent.merge(start, end, u'NP', np.lemma_, ent_type) + assert sent[0].string == 'A phrase ' + assert sent[1].string == 'with ' + assert sent[2].string == 'another phrase ' + assert sent[3].string == 'occurs' From ffbf9e9ca5cbf334691b4fb7dddbd8861a17ab47 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:11:14 +0200 Subject: [PATCH 075/138] * Remove docs --- docs/Makefile | 177 ----------- docs/source/conf.py | 271 ---------------- docs/source/depr/annotation.rst | 116 ------- docs/source/depr/api.rst | 1 - docs/source/depr/features.rst | 77 ----- docs/source/example_wsj0001.json | 337 -------------------- docs/source/howworks.rst | 262 ---------------- docs/source/index.rst | 339 --------------------- docs/source/license.rst | 126 -------- docs/source/quickstart.rst | 236 -------------- docs/source/reference/annotation.rst | 116 ------- docs/source/reference/index.rst | 112 ------- docs/source/reference/loading.rst | 41 --- docs/source/reference/lookup.rst | 111 ------- docs/source/reference/processing.rst | 89 ------ docs/source/reference/using/document.rst | 94 ------ docs/source/reference/using/index.rst | 11 - docs/source/reference/using/span.rst | 58 ---- docs/source/reference/using/token.rst | 195 ------------ docs/source/tutorials/lexrank_tutorial.rst | 280 ----------------- docs/source/updates.rst | 233 -------------- 21 files changed, 3282 deletions(-) delete mode 100644 docs/Makefile delete mode 100644 
docs/source/conf.py delete mode 100644 docs/source/depr/annotation.rst delete mode 100644 docs/source/depr/api.rst delete mode 100644 docs/source/depr/features.rst delete mode 100644 docs/source/example_wsj0001.json delete mode 100644 docs/source/howworks.rst delete mode 100644 docs/source/index.rst delete mode 100644 docs/source/license.rst delete mode 100644 docs/source/quickstart.rst delete mode 100644 docs/source/reference/annotation.rst delete mode 100644 docs/source/reference/index.rst delete mode 100644 docs/source/reference/loading.rst delete mode 100644 docs/source/reference/lookup.rst delete mode 100644 docs/source/reference/processing.rst delete mode 100644 docs/source/reference/using/document.rst delete mode 100644 docs/source/reference/using/index.rst delete mode 100644 docs/source/reference/using/span.rst delete mode 100644 docs/source/reference/using/token.rst delete mode 100644 docs/source/tutorials/lexrank_tutorial.rst delete mode 100644 docs/source/updates.rst diff --git a/docs/Makefile b/docs/Makefile deleted file mode 100644 index bb80f0928..000000000 --- a/docs/Makefile +++ /dev/null @@ -1,177 +0,0 @@ -# Makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = ../../docs-spacy - -# User-friendly check for sphinx-build -ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) -$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) -endif - -# Internal variables. 
-PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext - -help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - -clean: - rm -rf $(BUILDDIR)/* - -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." - -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/spaCy.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/spaCy.qhc" - -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/spaCy" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/spaCy" - @echo "# devhelp" - -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 
- -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." - -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index ac6849abd..000000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,271 +0,0 @@ -# -*- coding: utf-8 -*- -# -# spaCy documentation build configuration file, created by -# sphinx-quickstart on Thu Sep 25 17:47:15 2014. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -import sys -import os - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-#sys.path.insert(0, os.path.abspath('.')) - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.doctest', - 'sphinx.ext.intersphinx', - 'sphinx.ext.viewcode', - 'sphinxcontrib.napoleon', -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix of source filenames. -source_suffix = '.rst' - -# The encoding of source files. -#source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'spaCy' -copyright = u'2015, Matthew Honnibal' - - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = '0.85' -# The full version, including alpha/beta/rc tags. -release = '0.85' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -#language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -#today = '' -# Else, today_fmt is used as the format for a strftime call. -#today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -exclude_patterns = [] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -#default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -#add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -#add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -#show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -#modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -#keep_warnings = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -html_theme_options = { - 'google_analytics_id': 'UA-58931649-1' -} - -# Add any paths that contain custom themes here, relative to this directory. -html_theme_path = ["../_themes"] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -#html_title = None - -# A shorter title for the navigation bar. Default is the same as html_title. -#html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -#html_logo = None - -# The name of an image file (within the static path) to use as favicon of the -# docs. 
This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -#html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -#html_extra_path = [] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -#html_last_updated_fmt = '%b %d, %Y' - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -#html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -#html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -#html_additional_pages = {} - -# If false, no module index is generated. -#html_domain_indices = True - -# If false, no index is generated. -#html_use_index = True - -# If true, the index is split into individual pages for each letter. -#html_split_index = False - -# If true, links to the reST sources are added to the pages. -#html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -#html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -#html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -#html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -#html_file_suffix = None - -# Output file base name for HTML help builder. -htmlhelp_basename = 'spaCydoc' - - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { -# The paper size ('letterpaper' or 'a4paper'). -#'papersize': 'letterpaper', - -# The font size ('10pt', '11pt' or '12pt'). -#'pointsize': '10pt', - -# Additional stuff for the LaTeX preamble. -#'preamble': '', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - ('index', 'spaCy.tex', u'spaCy Documentation', - u'Matthew Honnibal', 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -#latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -#latex_use_parts = False - -# If true, show page references after internal links. -#latex_show_pagerefs = False - -# If true, show URL addresses after external links. -#latex_show_urls = False - -# Documents to append as an appendix to all manuals. -#latex_appendices = [] - -# If false, no module index is generated. -#latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - ('index', 'spacy', u'spaCy Documentation', - [u'Matthew Honnibal'], 1) -] - -# If true, show URL addresses after external links. 
-#man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ('index', 'spaCy', u'spaCy Documentation', - u'Matthew Honnibal', 'spaCy', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -#texinfo_appendices = [] - -# If false, no module index is generated. -#texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -#texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -#texinfo_no_detailmenu = False - - -# Example configuration for intersphinx: refer to the Python standard library. -intersphinx_mapping = {'http://docs.python.org/': None} diff --git a/docs/source/depr/annotation.rst b/docs/source/depr/annotation.rst deleted file mode 100644 index c19e70bbd..000000000 --- a/docs/source/depr/annotation.rst +++ /dev/null @@ -1,116 +0,0 @@ -==================== -Annotation Standards -==================== - -This document describes the target annotations spaCy is trained to predict. - -This is currently a work in progress. Please ask questions on the issue tracker, -so that the answers can be integrated here to improve the documentation. - -https://github.com/honnibal/spaCy/issues - -English -======= - -Tokenization ------------- - -Tokenization standards are based on the OntoNotes 5 corpus. - -The tokenizer differs from most by including tokens for significant whitespace. -Any sequence of whitespace characters beyond a single space (' ') is included -as a token. For instance: - - >>> from spacy.en import English - >>> nlp = English(parse=False) - >>> tokens = nlp(u'Some\nspaces and\ttab characters') - >>> print [t.orth_ for t in tokens] - [u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters'] - -The whitespace tokens are useful for much the same reason punctuation is --- it's -often an important delimiter in the text. By preserving it in the token output, -we are able to maintain a simple alignment between the tokens and the original -string, and we ensure that the token stream does not lose information. - -Sentence boundary detection ---------------------------- - -Sentence boundaries are calculated from the syntactic parse tree, so features -such as punctuation and capitalisation play an important but non-decisive role -in determining the sentence boundaries. Usually this means that the sentence -boundaries will at least coincide with clause boundaries, even given poorly -punctuated text. - -Part-of-speech Tagging ----------------------- - -The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank -tag set. We also map the tags to the simpler Google Universal POS Tag set. - -Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 - -Lemmatization -------------- - -A "lemma" is the uninflected form of a word. In English, this means: - -* Adjectives: The form like "happy", not "happier" or "happiest" -* Adverbs: The form like "badly", not "worse" or "worst" -* Nouns: The form like "dog", not "dogs"; like "child", not "children" -* Verbs: The form like "write", not "writes", "writing", "wrote" or "written" - -The lemmatization data is taken from WordNet. However, we also add a special -case for pronouns: all pronouns are lemmatized to the special token -PRON-. 
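A minimal sketch of how these lemmas surface at run-time, assuming the
``spacy.en.English`` pipeline and the ``token.orth_`` / ``token.lemma_``
attributes used elsewhere in these docs (the example sentence is arbitrary):

.. code:: python

    from spacy.en import English

    nlp = English()
    tokens = nlp(u'She wrote the stories for us.')
    for token in tokens:
        # token.lemma_ is the uninflected form described above, e.g.
        # "wrote" -> "write", "stories" -> "story"; the pronouns "She"
        # and "us" both come back as the special -PRON- token.
        print(token.orth_, token.lemma_)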
- -Syntactic Dependency Parsing ----------------------------- - -The parser is trained on data produced by the ClearNLP converter. Details of -the annotation scheme can be found here: - -http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf - -Named Entity Recognition ------------------------- - - +--------------+-----------------------------------------------------+ - | PERSON | People, including fictional | - +--------------+-----------------------------------------------------+ - | NORP | Nationalities or religious or political groups | - +--------------+-----------------------------------------------------+ - | FACILITY | Buildings, airports, highways, bridges, etc. | - +--------------+-----------------------------------------------------+ - | ORGANIZATION | Companies, agencies, institutions, etc. | - +--------------+-----------------------------------------------------+ - | GPE | Countries, cities, states | - +--------------+-----------------------------------------------------+ - | LOCATION | Non-GPE locations, mountain ranges, bodies of water | - +--------------+-----------------------------------------------------+ - | PRODUCT | Vehicles, weapons, foods, etc. (Not services) | - +--------------+-----------------------------------------------------+ - | EVENT | Named hurricanes, battles, wars, sports events, etc.| - +--------------+-----------------------------------------------------+ - | WORK OF ART | Titles of books, songs, etc. | - +--------------+-----------------------------------------------------+ - | LAW | Named documents made into laws | - +--------------+-----------------------------------------------------+ - | LANGUAGE | Any named language | - +--------------+-----------------------------------------------------+ - -The following values are also annotated in a style similar to names: - - +--------------+---------------------------------------------+ - | DATE | Absolute or relative dates or periods | - +--------------+---------------------------------------------+ - | TIME | Times smaller than a day | - +--------------+---------------------------------------------+ - | PERCENT | Percentage (including “%”) | - +--------------+---------------------------------------------+ - | MONEY | Monetary values, including unit | - +--------------+---------------------------------------------+ - | QUANTITY | Measurements, as of weight or distance | - +--------------+---------------------------------------------+ - | ORDINAL | "first", "second" | - +--------------+---------------------------------------------+ - | CARDINAL | Numerals that do not fall under another type| - +--------------+---------------------------------------------+ diff --git a/docs/source/depr/api.rst b/docs/source/depr/api.rst deleted file mode 100644 index 8b1378917..000000000 --- a/docs/source/depr/api.rst +++ /dev/null @@ -1 +0,0 @@ - diff --git a/docs/source/depr/features.rst b/docs/source/depr/features.rst deleted file mode 100644 index ecd465182..000000000 --- a/docs/source/depr/features.rst +++ /dev/null @@ -1,77 +0,0 @@ -Lexeme Features -=============== - -A lexeme is an entry in the lexicon --- the vocabulary --- for a word, punctuation -symbol, whitespace unit, etc. Lexemes come with lots of pre-computed information, -that help you write good feature functions. Features are integer-valued where -possible --- instead of strings, spaCy refers to strings by consecutive ID numbers, -which you can use to look up the string values if necessary. 
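A minimal sketch of the two-way mapping between strings and their integer
IDs, modelled on the ``nlp.vocab.strings`` usage shown in the usage examples
elsewhere in these docs (the word "apples" is arbitrary, and ``orth`` is the
attribute the newer examples use for the verbatim string, i.e. the SIC
feature in the table below):

.. code:: python

    from spacy.en import English

    nlp = English()
    lexeme = nlp.vocab[u'apples']
    # Integer-valued features such as lexeme.orth are IDs into the
    # vocabulary's string store...
    apples_id = lexeme.orth
    # ...and the same table maps an ID back to its unicode string.
    assert nlp.vocab.strings[apples_id] == u'apples'
    assert nlp.vocab.strings[u'apples'] == apples_id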
- -String features ---------------- - -+---------+-------------------------------------------------------------------+ -| SIC | The word as it appeared in the sentence, unaltered. | -+---------+-------------------------------------------------------------------+ -| NORM | For frequent words, case normalization is applied. | -| | Otherwise, back-off to SHAPE. | -+---------+-------------------------------------------------------------------+ -| SHAPE | Remap the characters of the word as follows: | -| | | -| | a-z --> x, A-Z --> X, 0-9 --> d, ,.;:"'?!$- --> self, other --> \*| -| | | -| | Trim sequences of length 3+ to 3, e.g | -| | | -| | apples --> xxx, Apples --> Xxxx, app9LES@ --> xxx9XXX* | -+---------+-------------------------------------------------------------------+ -| ASCIIED | Use unidecode.unidecode(sic) to approximate the word using the | -| | ascii characters. | -+---------+-------------------------------------------------------------------+ -| PREFIX | sic_unicode_string[:1] | -+---------+-------------------------------------------------------------------+ -| SUFFIX | sic_unicode_string[-3:] | -+---------+-------------------------------------------------------------------+ - - -Integer features ----------------- - -+--------------+--------------------------------------------------------------+ -| LENGTH | Length of the string, in unicode | -+--------------+--------------------------------------------------------------+ -| CLUSTER | Brown cluster | -+--------------+--------------------------------------------------------------+ -| POS_TYPE | K-means cluster of word's tag affinities | -+--------------+--------------------------------------------------------------+ -| SENSE_TYPE | K-means cluster of word's sense affinities | -+--------------+--------------------------------------------------------------+ - -Boolean features ----------------- - -+-------------+--------------------------------------------------------------+ -| IS_ALPHA | The result of sic.isalpha() | -+-------------+--------------------------------------------------------------+ -| IS_ASCII | Check whether all the word's characters are ascii characters | -+-------------+--------------------------------------------------------------+ -| IS_DIGIT | The result of sic.isdigit() | -+-------------+--------------------------------------------------------------+ -| IS_LOWER | The result of sic.islower() | -+-------------+--------------------------------------------------------------+ -| IS_PUNCT | Check whether all characters are in the class TODO | -+-------------+--------------------------------------------------------------+ -| IS_SPACE | The result of sic.isspace() | -+-------------+--------------------------------------------------------------+ -| IS_TITLE | The result of sic.istitle() | -+-------------+--------------------------------------------------------------+ -| IS_UPPER | The result of sic.isupper() | -+-------------+--------------------------------------------------------------+ -| LIKE_URL | Check whether the string looks like it could be a URL. Aims | -| | for low false negative rate. | -+-------------+--------------------------------------------------------------+ -| LIKE_NUMBER | Check whether the string looks like it could be a numeric | -| | entity, e.g. 10,000 10th .10 . Skews for low false negative | -| | rate. | -+-------------+--------------------------------------------------------------+ -| IN_LIST | Facility for loading arbitrary run-time word lists? 
| -+-------------+--------------------------------------------------------------+ diff --git a/docs/source/example_wsj0001.json b/docs/source/example_wsj0001.json deleted file mode 100644 index 25d1cf5c7..000000000 --- a/docs/source/example_wsj0001.json +++ /dev/null @@ -1,337 +0,0 @@ -{ - "id": "wsj_0001", - "paragraphs": [ - { - "raw": "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29. Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.", - - "segmented": "Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group.", - - "sents": [ - 0, - 85 - ], - - "tokens": [ - { - "dep": "NMOD", - "start": 0, - "head": 7, - "tag": "NNP", - "orth": "Pierre" - }, - { - "dep": "SUB", - "start": 7, - "head": 29, - "tag": "NNP", - "orth": "Vinken" - }, - { - "dep": "P", - "start": 13, - "head": 7, - "tag": ",", - "orth": "," - }, - { - "dep": "NMOD", - "start": 15, - "head": 18, - "tag": "CD", - "orth": "61" - }, - { - "dep": "AMOD", - "start": 18, - "head": 24, - "tag": "NNS", - "orth": "years" - }, - { - "dep": "NMOD", - "start": 24, - "head": 7, - "tag": "JJ", - "orth": "old" - }, - { - "dep": "P", - "start": 27, - "head": 7, - "tag": ",", - "orth": "," - }, - { - "dep": "ROOT", - "start": 29, - "head": -1, - "tag": "MD", - "orth": "will" - }, - { - "dep": "VC", - "start": 34, - "head": 29, - "tag": "VB", - "orth": "join" - }, - { - "dep": "NMOD", - "start": 39, - "head": 43, - "tag": "DT", - "orth": "the" - }, - { - "dep": "OBJ", - "start": 43, - "head": 34, - "tag": "NN", - "orth": "board" - }, - { - "dep": "VMOD", - "start": 49, - "head": 34, - "tag": "IN", - "orth": "as" - }, - { - "dep": "NMOD", - "start": 52, - "head": 67, - "tag": "DT", - "orth": "a" - }, - { - "dep": "NMOD", - "start": 54, - "head": 67, - "tag": "JJ", - "orth": "nonexecutive" - }, - { - "dep": "PMOD", - "start": 67, - "head": 49, - "tag": "NN", - "orth": "director" - }, - { - "dep": "VMOD", - "start": 76, - "head": 34, - "tag": "NNP", - "orth": "Nov." - }, - { - "dep": "NMOD", - "start": 81, - "head": 76, - "tag": "CD", - "orth": "29" - }, - { - "dep": "P", - "start": 83, - "head": 29, - "tag": ".", - "orth": "." - }, - { - "dep": "NMOD", - "start": 85, - "head": 89, - "tag": "NNP", - "orth": "Mr." - }, - { - "dep": "SUB", - "start": 89, - "head": 96, - "tag": "NNP", - "orth": "Vinken" - }, - { - "dep": "ROOT", - "start": 96, - "head": -1, - "tag": "VBZ", - "orth": "is" - }, - { - "dep": "PRD", - "start": 99, - "head": 96, - "tag": "NN", - "orth": "chairman" - }, - { - "dep": "NMOD", - "start": 108, - "head": 99, - "tag": "IN", - "orth": "of" - }, - { - "dep": "NMOD", - "start": 111, - "head": 120, - "tag": "NNP", - "orth": "Elsevier" - }, - { - "dep": "NMOD", - "start": 120, - "head": 147, - "tag": "NNP", - "orth": "N.V." - }, - { - "dep": "P", - "start": 124, - "head": 147, - "tag": ",", - "orth": "," - }, - { - "dep": "NMOD", - "start": 126, - "head": 147, - "tag": "DT", - "orth": "the" - }, - { - "dep": "NMOD", - "start": 130, - "head": 147, - "tag": "NNP", - "orth": "Dutch" - }, - { - "dep": "NMOD", - "start": 136, - "head": 147, - "tag": "VBG", - "orth": "publishing" - }, - { - "dep": "PMOD", - "start": 147, - "head": 108, - "tag": "NN", - "orth": "group" - }, - { - "dep": "P", - "start": 152, - "head": 96, - "tag": ".", - "orth": "." 
- } - ], - "brackets": [ - { - "start": 0, - "end": 7, - "label": "NP" - }, - { - "start": 15, - "end": 18, - "label": "NP" - }, - { - "start": 15, - "end": 24, - "label": "ADJP" - }, - { - "start": 0, - "end": 27, - "label": "NP-SBJ" - }, - { - "start": 39, - "end": 43, - "label": "NP" - }, - { - "start": 52, - "end": 67, - "label": "NP" - }, - { - "start": 49, - "end": 67, - "label": "PP-CLR" - }, - { - "start": 76, - "end": 81, - "label": "NP-TMP" - }, - { - "start": 34, - "end": 81, - "label": "VP" - }, - { - "start": 29, - "end": 81, - "label": "VP" - }, - { - "start": 0, - "end": 83, - "label": "S" - }, - { - "start": 85, - "end": 89, - "label": "NP-SBJ" - }, - { - "start": 99, - "end": 99, - "label": "NP" - }, - { - "start": 111, - "end": 120, - "label": "NP" - }, - { - "start": 126, - "end": 147, - "label": "NP" - }, - { - "start": 111, - "end": 147, - "label": "NP" - }, - { - "start": 108, - "end": 147, - "label": "PP" - }, - { - "start": 99, - "end": 147, - "label": "NP-PRD" - }, - { - "start": 96, - "end": 147, - "label": "VP" - }, - { - "start": 85, - "end": 152, - "label": "S" - } - ] - } - ] -} diff --git a/docs/source/howworks.rst b/docs/source/howworks.rst deleted file mode 100644 index 00d61d66d..000000000 --- a/docs/source/howworks.rst +++ /dev/null @@ -1,262 +0,0 @@ -How spaCy Works -=============== - -The following are some hasty preliminary notes on how spaCy works. The short -story is, there are no new killer algorithms. The way that the tokenizer works -is novel and a bit neat, and the parser has a new feature set, but otherwise -the key algorithms are well known in the recent literature. - -Some might also wonder how I get Python code to run so fast. I don't --- spaCy -is written in `Cython`_, an optionally statically-typed language that compiles -to C or C++, which is then loaded as a C extension module. -This makes it `easy to achieve the performance of native C code`_, but allows the -use of Python language features, via the Python C API. The Python unicode -library was particularly useful to me. I think it would have been much more -difficult to write spaCy in another language. - -.. _Cython: http://cython.org/ - -.. _easy to achieve the performance of native C code: https://honnibal.wordpress.com/2014/10/21/writing-c-in-cython/ - -Tokenizer and Lexicon ---------------------- - -Tokenization is the task of splitting a string into meaningful pieces, called -tokens, which you can then compute with. In practice, the task is usually to -match the tokenization performed in some treebank, or other corpus. If we want -to apply a tagger, entity recogniser, parser etc, then we want our run-time -text to match the training conventions. If we want to use a model that's been -trained to expect "isn't" to be split into two tokens, ["is", "n't"], then that's -how we need to prepare our data. - -In order to train spaCy's models with the best data available, I therefore -tokenize English according to the Penn Treebank scheme. It's not perfect, but -it's what everybody is using, and it's good enough. - -What we don't do -################ - -The Penn Treebank was distributed with a script called tokenizer.sed, which -tokenizes ASCII newswire text roughly according to the Penn Treebank standard. -Almost all tokenizers are based on these regular expressions, with various -updates to account for unicode characters, and the fact that it's no longer -1986 --- today's text has URLs, emails, emoji, etc. 
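-
-To make that concrete, here is a rough sketch of what this style of rule looks
-like when ported to Python. The patterns and the name ``sed_style_tokenize``
-are illustrative stand-ins, not the actual tokenizer.sed rules:
-
-.. code:: python
-
-    import re
-
-    # Illustrative passes only --- not the real Penn Treebank rules.
-    PASSES = [
-        (re.compile(r'([,;:!?()"])'), r' \1 '),                 # split off punctuation
-        (re.compile(r"(\w)(n't|'s|'re|'ve|'ll)\b"), r'\1 \2'),  # split off clitics
-    ]
-
-    def sed_style_tokenize(text):
-        for pattern, repl in PASSES:
-            # Each pass rewrites the whole string.
-            text = pattern.sub(repl, text)
-        return text.split()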
- -Usually, the resulting regular expressions are applied in multiple passes, which -is quite inefficient. Often no care is taken to preserve indices into the original -string. If you lose these indices, it'll be difficult to calculate mark-up based -on your annotations. - -Tokenizer Algorithm -################### - -spaCy's tokenizer assumes that no tokens will cross whitespace --- there will -be no multi-word tokens. If we want these, we can post-process the -token-stream later, merging as necessary. This assumption allows us to deal -only with small chunks of text. We can cache the processing of these, and -simplify our expressions somewhat. - -Here is what the outer-loop would look like in Python. (You can see the -production implementation, in Cython, here.) - -.. code:: python - - cache = {} - def tokenize(text): - tokens = [] - for substring in text.split(' '): - if substring in cache: - tokens.extend(cache[substring]) - else: - subtokens = _tokenize_substring(substring) - tokens.extend(subtokens) - cache[substring] = subtokens - return tokens - -The actual work is performed in _tokenize_substring. For this, I divide the -tokenization rules into three pieces: - -1. A prefixes expression, which matches from the start of the string; -2. A suffixes expression, which matches from the end of the string; -3. A special-cases table, which matches the whole string. - -The algorithm then proceeds roughly like this (consider this like pseudo-code; -this was written quickly and has not been executed): - -.. code:: python - - # Tokens which can be attached at the beginning or end of another - prefix_re = _make_re([",", '"', '(', ...]) - suffix_re = _make_re(s[",", "'", ":", "'s", ...]) - - # Contractions etc are simply enumerated, since they're a finite set. We - # can also specify anything we like here, which is nice --- different data - # has different quirks, so we want to be able to add ad hoc exceptions. - special_cases = { - "can't": ("ca", "n't"), - "won't": ("wo", "n't"), - "he'd've": ("he", "'d", "'ve"), - ... - ":)": (":)",) # We can add any arbitrary thing to this list. - } - - def _tokenize_substring(substring): - prefixes = [] - suffixes = [] - while substring not in special_cases: - prefix, substring = _apply_re(substring, prefix_re) - if prefix: - prefixes.append(prefix) - else: - suffix, substring = _apply_re(substring, suffix_re) - if suffix: - suffixes.append(suffix) - else: - break - - -This procedure splits off tokens from the start and end of the string, at each -point checking whether the remaining string is in our special-cases table. If -it is, we stop splitting, and return the tokenization at that point. - -The advantage of this design is that the prefixes, suffixes and special-cases -can be declared separately, in easy-to-understand files. If a new entry is -added to the special-cases, you can be sure that it won't have some unforeseen -consequence to a complicated regular-expression grammar. - -Coupling the Tokenizer and Lexicon -################################## - -As mentioned above, the tokenizer is designed to support easy caching. If all -we were caching were the matched substrings, this would not be so advantageous. -Instead, what we do is create a struct which houses all of our lexical -features, and cache *that*. The tokens are then simply pointers to these rich -lexical types. - -In a sample of text, vocabulary size grows exponentially slower than word -count. So any computations we can perform over the vocabulary and apply to the -word count are efficient. 
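-
-As a rough illustration of the idea --- in Python dictionaries rather than the
-C structs and hash tables spaCy actually uses, with hypothetical helper names ---
-the cache might look like this:
-
-.. code:: python
-
-    vocab = {}
-
-    def get_lexeme(string):
-        """Compute lexical features once per type, then reuse them."""
-        lexeme = vocab.get(string)
-        if lexeme is None:
-            lexeme = {
-                'orth': string,
-                'lower': string.lower(),
-                'is_alpha': string.isalpha(),
-                'is_digit': string.isdigit(),
-            }
-            vocab[string] = lexeme
-        return lexeme
-
-    def tokenize(text):
-        # Each token is just a reference to the shared lexeme for its type,
-        # so repeated words cost almost nothing to process.
-        return [get_lexeme(substring) for substring in text.split(' ')]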
- - -Part-of-speech Tagger ---------------------- - -.. _how to write a good part of speech tagger: https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ . - -In 2013, I wrote a blog post describing `how to write a good part of speech -tagger`_. -My recommendation then was to use greedy decoding with the averaged perceptron. -I think this is still the best approach, so it's what I implemented in spaCy. - -The tutorial also recommends the use of Brown cluster features, and case -normalization features, as these make the model more robust and domain -independent. spaCy's tagger makes heavy use of these features. - -Dependency Parser ------------------ - -.. _2014 blog post: https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/ - -The parser uses the algorithm described in my `2014 blog post`_. -This algorithm, shift-reduce dependency parsing, is becoming widely adopted due -to its compelling speed/accuracy trade-off. - -Some quick details about spaCy's take on this, for those who happen to know -these models well. I'll write up a better description shortly. - -1. I use greedy decoding, not beam search; -2. I use the arc-eager transition system; -3. I use the Goldberg and Nivre (2012) dynamic oracle. -4. I use the non-monotonic update from my CoNLL 2013 paper (Honnibal, Goldberg - and Johnson 2013). - -So far, this is exactly the configuration from the CoNLL 2013 paper, which -scored 91.0. So how have I gotten it to 92.4? The following tweaks: - -1. I use Brown cluster features --- these help a lot; -2. I redesigned the feature set. I've long known that the Zhang and Nivre - (2011) feature set was suboptimal, but a few features don't make a very - compelling publication. Still, they're important. -3. When I do the dynamic oracle training, I also make - the upate cost-sensitive: if the oracle determines that the move the parser - took has a cost of N, then the weights for the gold class are incremented by - +N, and the weights for the predicted class are incremented by -N. This - only made a small (0.1-0.2%) difference. - -Implementation -############## - -I don't do anything algorithmically novel to improve the efficiency of the -parser. However, I was very careful in the implementation. - -A greedy shift-reduce parser with a linear model boils down to the following -loop: - -.. code:: python - - def parse(words, model, feature_funcs, n_classes): - state = init_state(words) - for _ in range(len(words) * 2): - features = [templ(state) for templ in feature_funcs] - scores = [0 for _ in range(n_classes)] - for feat in features: - weights = model[feat] - for i, weight in enumerate(weights): - scores[i] += weight - class_, score = max(enumerate(scores), key=lambda item: item[1]) - transition(state, class_) - -The parser makes 2N transitions for a sentence of length N. In order to select -the transition, it extracts a vector of K features from the state. Each feature -is used as a key into a hash table managed by the model. The features map to -a vector of weights, of length C. We then dot product the feature weights to the -scores vector we are building for that instance. - -The inner-most loop here is not so bad: we only have a few dozen classes, so -it's just a short dot product. Both of the vectors are in the cache, so this -is a snack to a modern CPU. 
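-
-For a concrete picture of the weight layout, here is the scoring step again as
-a small Python sketch (the function name is mine; the real implementation is
-Cython over C arrays). The layout is roughly the same idea: each feature maps
-to one contiguous vector of per-class weights, so a single lookup fetches all
-the class weights at once:
-
-.. code:: python
-
-    import numpy as np
-
-    def score_classes(features, model, n_classes):
-        # model: dict mapping a feature key to an ndarray of shape (n_classes,)
-        scores = np.zeros(n_classes)
-        for feat in features:
-            weights = model.get(feat)   # one hash-table lookup per feature
-            if weights is not None:
-                scores += weights       # add the whole class-weight vector
-        return scores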
- -The bottle-neck in this algorithm is the 2NK look-ups into the hash-table that -we must make, as these almost always have to hit main memory. The feature-set -is enormously large, because all of our features are one-hot boolean -indicators. Some of the features will be common, so they'll lurk around in the -CPU's cache hierarchy. But a lot of them won't be, and accessing main memory -takes a lot of cycles. - -.. _Jeff Preshing's excellent post: http://preshing.com/20130107/this-hash-table-is-faster-than-a-judy-array/ . - -I used to use the Google dense_hash_map implementation. This seemed a solid -choice: it came from a big brand, it was in C++, and it seemed very -complicated. Later, I read `Jeff Preshing's excellent post`_ on open-addressing -with linear probing. -This really spoke to me. I had assumed that a fast hash table implementation -would necessarily be very complicated, but no --- this is another situation -where the simple strategy wins. - -I've packaged my Cython implementation separately from spaCy, in the package -`preshed`_ --- for "pre-hashed", but also as a nod to Preshing. I've also taken -great care over the feature extraction and perceptron code, which I'm distributing -in a package named `thinc`_ (since it's for learning very sparse models with -Cython). - -.. _preshed: https://github.com/syllog1sm/preshed - -.. _thinc: https://github.com/honnibal/thinc - -By the way: from comparing notes with a few people, it seems common to -implement linear models in a way that's suboptimal for multi-class -classification. The mistake is to store in the hash-table one weight per -(feature, class) pair, rather than mapping the feature to a vector of weights, -for all of the classes. This is bad because it means you need to hit the table -C times, one per class, as you always need to evaluate a feature against all of -the classes. In the case of the parser, this means the hash table is accessed -2NKC times, instead of the 2NK times if you have a weights vector. You should -also be careful to store the weights contiguously in memory --- you don't want -a linked list here. I use a block-sparse format, because my problems tend to -have a few dozen classes. - -I guess if I had to summarize my experience, I'd say that the efficiency of -these models is really all about the data structures. We want to stay small, -and stay contiguous. Minimize redundancy and minimize pointer chasing. -That's why Cython is so well suited to this: we get to lay out our data -structures, and manage the memory ourselves, with full C-level control. diff --git a/docs/source/index.rst b/docs/source/index.rst deleted file mode 100644 index 8aa457595..000000000 --- a/docs/source/index.rst +++ /dev/null @@ -1,339 +0,0 @@ -.. spaCy documentation master file, created by - sphinx-quickstart on Tue Aug 19 16:27:38 2014. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -============================== -spaCy: Industrial-strength NLP -============================== - - -.. _Issue Tracker: https://github.com/honnibal/spaCy/issues - -**2015-07-08**: `Version 0.88 released`_ - -.. _Version 0.87 released: updates.html - -`spaCy`_ is a new library for text processing in Python and Cython. -I wrote it because I think small companies are terrible at -natural language processing (NLP). Or rather: -small companies are using terrible NLP technology. - -.. 
_spaCy: https://github.com/honnibal/spaCy/ - -To do great NLP, you have to know a little about linguistics, a lot -about machine learning, and almost everything about the latest research. -The people who fit this description seldom join small companies. -Most are broke --- they've just finished grad school. -If they don't want to stay in academia, they join Google, IBM, etc. - -The net result is that outside of the tech giants, commercial NLP has changed -little in the last ten years. In academia, it's changed entirely. Amazing -improvements in quality. Orders of magnitude faster. But the -academic code is always GPL, undocumented, unuseable, or all three. You could -implement the ideas yourself, but the papers are hard to read, and training -data is exorbitantly expensive. So what are you left with? A common answer is -NLTK, which was written primarily as an educational resource. Nothing past the -tokenizer is suitable for production use. - -I used to think that the NLP community just needed to do more to communicate -its findings to software engineers. So I wrote two blog posts, explaining -`how to write a part-of-speech tagger`_ and `parser`_. Both were well received, -and there's been a bit of interest in `my research software`_ --- even though -it's entirely undocumented, and mostly unuseable to anyone but me. - -.. _`my research software`: https://github.com/syllog1sm/redshift/tree/develop - -.. _`how to write a part-of-speech tagger`: https://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/ - -.. _`parser`: https://honnibal.wordpress.com/2013/12/18/a-simple-fast-algorithm-for-natural-language-dependency-parsing/ - -So six months ago I quit my post-doc, and I've been working day and night on -spaCy since. I'm now pleased to announce an alpha release. - -If you're a small company doing NLP, I think spaCy will seem like a minor miracle. -It's by far the fastest NLP software ever released. -The full processing pipeline completes in 20ms per document, including accurate -tagging and parsing. All strings are mapped to integer IDs, tokens are linked -to embedded word representations, and a range of useful features are pre-calculated -and cached. - -If none of that made any sense to you, here's the gist of it. Computers don't -understand text. This is unfortunate, because that's what the web almost entirely -consists of. We want to recommend people text based on other text they liked. -We want to shorten text to display it on a mobile screen. We want to aggregate -it, link it, filter it, categorise it, generate it and correct it. - -spaCy provides a library of utility functions that help programmers build such -products. It's commercial open source software: you can either use it under -the AGPL, or you can `buy a commercial license`_ for a one-time fee. - -.. _buy a commercial license: license.html - -Example functionality ---------------------- - -Let's say you're developing a proofreading tool, or possibly an IDE for -writers. You're convinced by Stephen King's advice that `adverbs are not your -friend `_, so -you want to **highlight all adverbs**. We'll use one of the examples he finds -particularly egregious: - - >>> import spacy.en - >>> from spacy.parts_of_speech import ADV - >>> # Load the pipeline, and call it with some text. 
- >>> nlp = spacy.en.English() - >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", tag=True, parse=False) - >>> print u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) - u‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’ - - -Easy enough --- but the problem is that we've also highlighted "back". -While "back" is undoubtedly an adverb, we probably don't want to highlight it. -If what we're trying to do is flag dubious stylistic choices, we'll need to -refine our logic. It turns out only a certain type of adverb is of interest to -us. - -There are lots of ways we might do this, depending on just what words -we want to flag. The simplest way to exclude adverbs like "back" and "not" -is by word frequency: these words are much more common than the prototypical -manner adverbs that the style guides are worried about. - -The :py:attr:`Lexeme.prob` and :py:attr:`Token.prob` attribute gives a -log probability estimate of the word: - - >>> nlp.vocab[u'back'].prob - -7.403977394104004 - >>> nlp.vocab[u'not'].prob - -5.407193660736084 - >>> nlp.vocab[u'quietly'].prob - -11.07155704498291 - -(The probability estimate is based on counts from a 3 billion word corpus, -smoothed using the `Simple Good-Turing`_ method.) - -.. _`Simple Good-Turing`: http://www.d.umn.edu/~tpederse/Courses/CS8761-FALL02/Code/sgt-gale.pdf - -So we can easily exclude the N most frequent words in English from our adverb -marker. Let's try N=1000 for now: - - >>> import spacy.en - >>> from spacy.parts_of_speech import ADV - >>> nlp = spacy.en.English() - >>> # Find log probability of Nth most frequent word - >>> probs = [lex.prob for lex in nlp.vocab] - >>> probs.sort() - >>> is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - >>> tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - >>> print u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - ‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’ - -There are lots of other ways we could refine the logic, depending on just what -words we want to flag. Let's say we wanted to only flag adverbs that modified words -similar to "pleaded". This is easy to do, as spaCy loads a vector-space -representation for every word (by default, the vectors produced by -`Levy and Goldberg (2014)`_). Naturally, the vector is provided as a numpy -array: - - >>> pleaded = tokens[7] - >>> pleaded.repvec.shape - (300,) - >>> pleaded.repvec[:5] - array([ 0.04229792, 0.07459262, 0.00820188, -0.02181299, 0.07519238], dtype=float32) - -.. _Levy and Goldberg (2014): https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/ - -We want to sort the words in our vocabulary by their similarity to "pleaded". -There are lots of ways to measure the similarity of two vectors. 
We'll use the
-cosine metric:
-
-    >>> from numpy import dot
-    >>> from numpy.linalg import norm
-
-    >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
-    >>> words = [w for w in nlp.vocab if w.has_repvec]
-    >>> words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec))
-    >>> words.reverse()
-    >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
-    1-20 pleaded, pled, plead, confessed, interceded, pleads, testified, conspired, motioned, demurred, countersued, remonstrated, begged, apologised, consented, acquiesced, petitioned, quarreled, appealed, pleading
-    >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
-    50-60 counselled, bragged, backtracked, caucused, refiled, dueled, mused, dissented, yearned, confesses
-    >>> print('100-110', ', '.join(w.orth_ for w in words[100:110]))
-    100-110 cabled, ducked, sentenced, perjured, absconded, bargained, overstayed, clerked, confided, sympathizes
-    >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
-    1000-1010 scorned, baled, righted, requested, swindled, posited, firebombed, slimed, deferred, sagged
-    >>> print('50000-50010', ', '.join(w.orth_ for w in words[50000:50010]))
-    50000-50010 fb, ford, systems, puck, anglers, ik, tabloid, dirty, rims, artists
-
-As you can see, the similarity model that these vectors give us is excellent
---- we're still getting meaningful results at 1000 words, off a single
-prototype! The only problem is that the list really contains two clusters of
-words: one associated with the legal meaning of "pleaded", and one for the more
-general sense. Sorting out these clusters is an area of active research.
-
-A simple work-around is to average the vectors of several words, and use that
-as our target:
-
-    >>> say_verbs = ['pleaded', 'confessed', 'remonstrated', 'begged', 'bragged', 'confided', 'requested']
-    >>> say_vector = sum(nlp.vocab[verb].repvec for verb in say_verbs) / len(say_verbs)
-    >>> words.sort(key=lambda w: cosine(w.repvec, say_vector))
-    >>> words.reverse()
-    >>> print('1-20', ', '.join(w.orth_ for w in words[0:20]))
-    1-20 bragged, remonstrated, enquired, demurred, sighed, mused, intimated, retorted, entreated, motioned, ranted, confided, countersued, gestured, implored, interceded, muttered, marvelled, bickered, despaired
-    >>> print('50-60', ', '.join(w.orth_ for w in words[50:60]))
-    50-60 flaunted, quarrelled, ingratiated, vouched, agonized, apologised, lunched, joked, chafed, schemed
-    >>> print('1000-1010', ', '.join(w.orth_ for w in words[1000:1010]))
-    1000-1010 hoarded, waded, ensnared, clamoring, abided, deploring, shriveled, endeared, rethought, berate
-
-These definitely look like words that King might scold a writer for attaching
-adverbs to. Recall that our original adverb highlighting function looked like
-this:
-
-    >>> import spacy.en
-    >>> from spacy.parts_of_speech import ADV
-    >>> # Load the pipeline, and call it with some text.
-    >>> nlp = spacy.en.English()
-    >>> tokens = nlp("‘Give it back,’ he pleaded abjectly, ‘it’s mine.’",
-                     tag=True, parse=False)
-    >>> print(''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens))
-    ‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’
-
-
-We wanted to refine the logic so that only adverbs modifying evocative verbs
-of communication, like "pleaded", were highlighted.
We've now built a vector that
-represents that type of word, so now we can highlight adverbs based on
-subtle logic, honing in on adverbs that seem the most stylistically
-problematic, given our starting assumptions:
-
-    >>> import numpy
-    >>> from numpy import dot
-    >>> from numpy.linalg import norm
-    >>> import spacy.en
-    >>> from spacy.parts_of_speech import ADV, VERB
-    >>> cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
-    >>> def is_bad_adverb(token, target_verb, tol):
-    ...     if token.pos != ADV:
-    ...         return False
-    ...     elif token.head.pos != VERB:
-    ...         return False
-    ...     elif cosine(token.head.repvec, target_verb) < tol:
-    ...         return False
-    ...     else:
-    ...         return True
-
-
-This example was somewhat contrived --- and, truth be told, I've never really
-bought the idea that adverbs were a grave stylistic sin. But hopefully it got
-the message across: the state-of-the-art NLP technologies are very powerful.
-spaCy gives you easy and efficient access to them, which lets you build all
-sorts of useful products and features that were previously impossible.
-
-
-Independent Evaluation
-----------------------
-
-.. table:: Independent evaluation by Yahoo! Labs and Emory
-    University, to appear at ACL 2015. Higher is better.
-
-    +----------------+------------+------------+------------+
-    | System         | Language   | Accuracy   | Speed      |
-    +----------------+------------+------------+------------+
-    | spaCy v0.86    | Cython     | 91.9       | **13,963** |
-    +----------------+------------+------------+------------+
-    | ClearNLP       | Java       | 91.7       | 10,271     |
-    +----------------+------------+------------+------------+
-    | spaCy v0.84    | Cython     | 90.9       | 13,963     |
-    +----------------+------------+------------+------------+
-    | CoreNLP        | Java       | 89.6       | 8,602      |
-    +----------------+------------+------------+------------+
-    | MATE           | Java       | **92.5**   | 550        |
-    +----------------+------------+------------+------------+
-    | Turbo          | C++        | 92.4       | 349        |
-    +----------------+------------+------------+------------+
-    | Yara           | Java       | 92.3       | 340        |
-    +----------------+------------+------------+------------+
-
-Accuracy is % unlabelled arcs correct, speed is tokens per second.
-
-Joel Tetreault and Amanda Stent (Yahoo! Labs) and Jin-ho Choi (Emory) performed
-a detailed comparison of the best parsers available. All numbers above
-are taken from the pre-print they kindly made available to me,
-except for spaCy v0.86.
-
-I'm particularly grateful to the authors for discussion of their results, which
-led to the improvement in accuracy between v0.84 and v0.86. A tip from Jin-ho
-(developer of ClearNLP) was particularly useful.
-
-
-Detailed Speed Comparison
--------------------------
-
-**Set up**: 100,000 plain-text documents were streamed from an SQLite3
-database, and processed with an NLP library, to one of three levels of detail
---- tokenization, tagging, or parsing. The tasks are additive: to parse the
-text you have to tokenize and tag it. The pre-processing was not subtracted
-from the times --- I report the time required for the pipeline to complete.
-I report mean times per document, in milliseconds.
-
-**Hardware**: Intel i7-3770 (2012)
-
-.. table:: Per-document processing times. Lower is better.
- - +--------------+---------------------------+--------------------------------+ - | | Absolute (ms per doc) | Relative (to spaCy) | - +--------------+----------+--------+-------+----------+---------+-----------+ - | System | Tokenize | Tag | Parse | Tokenize | Tag | Parse | - +--------------+----------+--------+-------+----------+---------+-----------+ - | spaCy | 0.2ms | 1ms | 19ms | 1x | 1x | 1x | - +--------------+----------+--------+-------+----------+---------+-----------+ - | CoreNLP | 2ms | 10ms | 49ms | 10x | 10x | 2.6x | - +--------------+----------+--------+-------+----------+---------+-----------+ - | ZPar | 1ms | 8ms | 850ms | 5x | 8x | 44.7x | - +--------------+----------+--------+-------+----------+---------+-----------+ - | NLTK | 4ms | 443ms | n/a | 20x | 443x | n/a | - +--------------+----------+--------+-------+----------+---------+-----------+ - - -Efficiency is a major concern for NLP applications. It is very common to hear -people say that they cannot afford more detailed processing, because their -datasets are too large. This is a bad position to be in. If you can't apply -detailed processing, you generally have to cobble together various heuristics. -This normally takes a few iterations, and what you come up with will usually be -brittle and difficult to reason about. - -spaCy's parser is faster than most taggers, and its tokenizer is fast enough -for any workload. And the tokenizer doesn't just give you a list -of strings. A spaCy token is a pointer to a Lexeme struct, from which you can -access a wide range of pre-computed features, including embedded word -representations. - -.. I wrote spaCy because I think existing commercial NLP engines are crap. - Alchemy API are a typical example. Check out this part of their terms of - service: - publish or perform any benchmark or performance tests or analysis relating to - the Service or the use thereof without express authorization from AlchemyAPI; - -.. Did you get that? You're not allowed to evaluate how well their system works, - unless you're granted a special exception. Their system must be pretty - terrible to motivate such an embarrassing restriction. - They must know this makes them look bad, but they apparently believe allowing - you to evaluate their product would make them look even worse! - -.. spaCy is based on science, not alchemy. It's open source, and I am happy to - clarify any detail of the algorithms I've implemented. - It's evaluated against the current best published systems, following the standard - methodologies. These evaluations show that it performs extremely well. -.. See `Benchmarks`_ for details. - - -.. toctree:: - :maxdepth: 4 - :hidden: - - quickstart.rst - reference/index.rst - license.rst - updates.rst diff --git a/docs/source/license.rst b/docs/source/license.rst deleted file mode 100644 index 7f3b55418..000000000 --- a/docs/source/license.rst +++ /dev/null @@ -1,126 +0,0 @@ -======= -License -======= - -* Download the `license agreement`_ -* Get in touch: matt@spacy.io - -.. 
_license agreement: spacy_trial_free.docx
-
-
-  +------------+-----------+----------+-------------------------------------+
-  | License    | Price     | Term     | Suitable for                        |
-  +============+===========+==========+=====================================+
-  | Commercial | $5,000    | Life     | Production use                      |
-  +------------+-----------+----------+-------------------------------------+
-  | Trial      | $0        | 90 days  | Evaluation, seed startup            |
-  +------------+-----------+----------+-------------------------------------+
-  | AGPLv3     | Free      | Life     | Research, teaching, hobbyists, FOSS |
-  +------------+-----------+----------+-------------------------------------+
-
-
-To make spaCy as valuable as possible, licenses to it are for life. You get
-complete transparency, certainty and control.
-If you need to use spaCy as an API, it's trivial to host it yourself --- and
-you don't need to worry about the service changing or disappearing.
-And if you're ever in acquisition or IPO talks, the story is simple.
-
-spaCy can also be used as free open-source software, under the Affero GPL
-license. If you use it this way, you must comply with the AGPL license terms.
-When you distribute your project, or offer it as a network service, you must
-distribute the source-code and grant users an AGPL license to it.
-
-
-.. I left academia in June 2014, just when I should have been submitting my first
-   grant proposal. Grant writing seemed a bad business model. I wasn't sure
-   exactly what I would do instead, but I knew that the work I could do was
-   valuable, and that it would make sense for people to pay me to do it, and that
-   it's often easy to convince smart people of things that are true.
-
-.. I left because I don't like the grant system. It's not the
-   best way to create value, and it's not the best way to get paid.
-
-
-Examples
---------
-
-In order to clarify how spaCy's license structure might apply to you, I've
-written a few examples, in the form of user-stories.
-
-Ashley and Casey: Seed stage start-up
-#####################################
-
-Ashley and Casey have an idea for a start-up. To explore their idea, they want
-to build a minimum viable product they can put in front of potential users and
-investors.
-
-They have two options.
-
- 1. **Trial commercial license.** With a simple form, they can use spaCy for 90
-    days, for a nominal fee of $1. They are free to modify spaCy, and they
-    will own the copyright to their modifications for the duration of the license.
-    After the trial period elapses, they can either pay the license fee, stop
-    using spaCy, or release their project under the AGPL.
-
- 2. **AGPL.** Ashley and Casey can instead use spaCy under the AGPL license.
-    However, they must then release any code that statically or dynamically
-    links to spaCy under the AGPL as well (e.g. if they import the module, or
-    import a module that imports it, etc). They also cannot use spaCy as
-    a network resource, by running it as a service --- this is the
-    loophole that the "A" part of the AGPL is designed to close.
-
-Ashley and Casey find the AGPL license unattractive for commercial use.
-They decide to take up the trial commercial license.
-However, over the next 90 days, Ashley has to move house twice, and Casey gets
-sick. By the time the trial expires, they still don't have a demo they can show
-investors. They send an email explaining the situation, and a 90 day extension
-to their trial license is granted.
- -By the time the extension period has elapsed, spaCy has helped them secure -funding, and they even have a little revenue. They are glad to pay the $5,000 -commercial license fee. - -spaCy is now permanently licensed for the product Ashley and Casey are -developing. They own the copyright to any modifications they make to spaCy, -but not to the original spaCy code. - -No additional fees will be due when they hire new developers, run spaCy on -additional internal servers, etc. If their company is acquired, the license will -be transferred to the company acquiring them. However, to use spaCy in another -product, they will have to buy a second license. - - -Alex and Sasha: University Academics -#################################### - -Alex and Sasha are post-doctoral researchers working for a university. Part of -their funding comes from a grant from Google, but Google will not own any part -of the work that they produce. Their mission is just to write papers. - -Alex and Sasha find spaCy convenient, so they use it in their system under the -AGPL. This means that their system must also be released under the AGPL, but they're -cool with that --- they were going to release their code anyway, as it's the only -way to ensure their experiments are properly repeatable. - -Alex and Sasha find and fix a few bugs in spaCy. They must release these -modifications, and they ask that they be accepted into the main spaCy repo. -In order to do this, they must sign a contributor agreement, ceding their -copyright. When commercial licenses to spaCy are sold, Alex and Sasha will -not be able to claim any royalties from their contributions. - -Later, Alex and Sasha implement new features into spaCy, for another paper. The -code was quite rushed, and they don't want to take the time to put together a -proper pull request. They must release their modifications under the AGPL, but -they are not obliged to contribute it to the spaCy repository, or concede their -copyright. - - -Phuong and Jessie: Open Source developers -######################################### - -Phuong and Jessie use the open-source software Calibre to manage their e-book -libraries. They have an idea for a search feature, and they want to use spaCy -to implement it. Calibre is released under the GPLv3. The AGPL has additional -restrictions for projects used as a network resource, but they don't apply to -this project, so Phuong and Jessie can use spaCy to improve Calibre. They'll -have to release their code, but that was always their intention anyway. diff --git a/docs/source/quickstart.rst b/docs/source/quickstart.rst deleted file mode 100644 index ba1c24626..000000000 --- a/docs/source/quickstart.rst +++ /dev/null @@ -1,236 +0,0 @@ -Quick Start -=========== - - -Install -------- - -.. py:currentmodule:: spacy - - -With Python 2.7 or Python 3, using Linux or OSX, run: - -.. code:: bash - - $ pip install spacy - $ python -m spacy.en.download - -.. _300 mb of data: http://s3-us-west-1.amazonaws.com/media.spacynlp.com/en_data_all-0.4.tgz - - -The download command fetches and installs about 300mb of data, for the -parser model and word vectors, which it installs within the spacy.en package directory. - -If you're stuck using a server with an old version of Python, and you don't -have root access, I've prepared a bootstrap script to help you compile a local -Python install. Run: - -.. 
code:: bash - - $ curl https://raw.githubusercontent.com/honnibal/spaCy/master/bootstrap_python_env.sh | bash && source .env/bin/activate - -The other way to install the package is to clone the github repository, and -build it from source. This installs an additional dependency, Cython. -If you're using Python 2, I also recommend installing fabric and fabtools --- -this is how I build the project. - -.. code:: bash - - $ git clone https://github.com/honnibal/spaCy.git - $ cd spaCy - $ virtualenv .env && source .env/bin/activate - $ export PYTHONPATH=`pwd` - $ pip install -r requirements.txt - $ python setup.py build_ext --inplace - $ python -m spacy.en.download - $ pip install pytest - $ py.test tests/ - -Python packaging is awkward at the best of times, and it's particularly tricky -with C extensions, built via Cython, requiring large data files. So, please -report issues as you encounter them, and bear with me :) - -Usage ------ - -The main entry-point is :meth:`en.English.__call__`, which accepts a unicode string -as an argument, and returns a :py:class:`tokens.Doc` object. You can -iterate over it to get :py:class:`tokens.Token` objects, which provide -a convenient API: - - >>> from __future__ import unicode_literals # If Python 2 - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp(u'I ate the pizza with anchovies.') - >>> pizza = tokens[3] - >>> (pizza.orth, pizza.orth_, pizza.head.lemma, pizza.head.lemma_) - ... (14702, u'pizza', 14702, u'eat') - -spaCy maps all strings to sequential integer IDs --- a common trick in NLP. -If an attribute `Token.foo` is an integer ID, then `Token.foo_` is the string, -e.g. `pizza.orth` and `pizza.orth_` provide the integer ID and the string of -the original orthographic form of the word. - - .. note:: en.English.__call__ is stateful --- it has an important **side-effect**. - - When it processes a previously unseen word, it increments the ID counter, - assigns the ID to the string, and writes the mapping in - :py:data:`English.vocab.strings` (instance of - :py:class:`strings.StringStore`). - Future releases will feature a way to reconcile mappings, but for now, you - should only work with one instance of the pipeline at a time. - - -(Most of the) API at a glance ------------------------------ - -**Process the string:** - - .. py:class:: spacy.en.English(self, data_dir=join(dirname(__file__), 'data')) - - .. py:method:: __call__(self, text: unicode, tag=True, parse=True, entity=True, merge_mwes=False) --> Doc - - +-----------------+--------------+--------------+ - | Attribute | Type | Its API | - +=================+==============+==============+ - | vocab | Vocab | __getitem__ | - +-----------------+--------------+--------------+ - | vocab.strings | StingStore | __getitem__ | - +-----------------+--------------+--------------+ - | tokenizer | Tokenizer | __call__ | - +-----------------+--------------+--------------+ - | tagger | EnPosTagger | __call__ | - +-----------------+--------------+--------------+ - | parser | GreedyParser | __call__ | - +-----------------+--------------+--------------+ - | entity | GreedyParser | __call__ | - +-----------------+--------------+--------------+ - -**Get dict or numpy array:** - - .. py:method:: tokens.Doc.to_array(self, attr_ids: List[int]) --> ndarray[ndim=2, dtype=long] - - .. py:method:: tokens.Doc.count_by(self, attr_id: int) --> Dict[int, int] - -**Get Token objects** - - .. py:method:: tokens.Doc.__getitem__(self, i) --> Token - - .. 
py:method:: tokens.Doc.__iter__(self) --> Iterator[Token] - -**Get sentence or named entity spans** - - .. py:attribute:: tokens.Doc.sents --> Iterator[Span] - - .. py:attribute:: tokens.Doc.ents --> Iterator[Span] - - You can iterate over a Span to access individual Doc, or access its - start, end or label. - - -**Embedded word representenations** - - .. py:attribute:: tokens.Token.repvec - - .. py:attribute:: lexeme.Lexeme.repvec - - -**Navigate to tree- or string-neighbor tokens** - - .. py:method:: nbor(self, i=1) --> Token - - .. py:method:: child(self, i=1) --> Token - - .. py:method:: sibling(self, i=1) --> Token - - .. py:attribute:: head: Token - - .. py:attribute:: dep: int - -**Align to original string** - - .. py:attribute:: string: unicode - - Padded with original whitespace. - - .. py:attribute:: length: int - - Length, in unicode code-points. Equal to len(self.orth_). - - .. py:attribute:: idx: int - - Starting offset of word in the original string. - - -Features --------- - - -**Boolean features** - - >>> lexeme = nlp.vocab[u'Apple'] - >>> lexeme.is_alpha, is_upper - True, False - >>> tokens = nlp('Apple computers') - >>> tokens[0].is_alpha, tokens[0].is_upper - >>> True, False - >>> from spacy.en.attrs import IS_ALPHA, IS_UPPER - >>> tokens.to_array((IS_ALPHA, IS_UPPER))[0] - array([1, 0]) - - +----------+---------------------------------------------------------------+ - | is_alpha | :py:meth:`str.isalpha` | - +----------+---------------------------------------------------------------+ - | is_digit | :py:meth:`str.isdigit` | - +----------+---------------------------------------------------------------+ - | is_lower | :py:meth:`str.islower` | - +----------+---------------------------------------------------------------+ - | is_title | :py:meth:`str.istitle` | - +----------+---------------------------------------------------------------+ - | is_upper | :py:meth:`str.isupper` | - +----------+---------------------------------------------------------------+ - | is_ascii | all(ord(c) < 128 for c in string) | - +----------+---------------------------------------------------------------+ - | is_punct | all(unicodedata.category(c).startswith('P') for c in string) | - +----------+---------------------------------------------------------------+ - | like_url | Using various heuristics, does the string resemble a URL? | - +----------+---------------------------------------------------------------+ - | like_num | "Two", "10", "1,000", "10.54", "1/2" etc all match | - +----------+---------------------------------------------------------------+ - -**String-transform Features** - - - +----------+---------------------------------------------------------------+ - | orth | The original string, unmodified. | - +----------+---------------------------------------------------------------+ - | lower | The original string, forced to lower-case | - +----------+---------------------------------------------------------------+ - | norm | The string after additional normalization | - +----------+---------------------------------------------------------------+ - | shape | Word shape, e.g. 10 --> dd, Garden --> Xxxx, Hi!5 --> Xx!d | - +----------+---------------------------------------------------------------+ - | prefix | A short slice from the start of the string. | - +----------+---------------------------------------------------------------+ - | suffix | A short slice from the end of the string. 
| - +----------+---------------------------------------------------------------+ - | lemma | The word's lemma, i.e. morphological suffixes removed | - +----------+---------------------------------------------------------------+ - -**Syntactic labels** - - +----------+---------------------------------------------------------------+ - | pos | The word's part-of-speech, from the Google Universal Tag Set | - +----------+---------------------------------------------------------------+ - | tag | A fine-grained morphosyntactic tag, e.g. VBZ, NNS, etc | - +----------+---------------------------------------------------------------+ - | dep | Dependency type label between word and its head, e.g. subj | - +----------+---------------------------------------------------------------+ - -**Distributional** - - +---------+-----------------------------------------------------------+ - | cluster | Brown cluster ID of the word | - +---------+-----------------------------------------------------------+ - | prob | Log probability of word, smoothed with Simple Good-Turing | - +---------+-----------------------------------------------------------+ diff --git a/docs/source/reference/annotation.rst b/docs/source/reference/annotation.rst deleted file mode 100644 index c19e70bbd..000000000 --- a/docs/source/reference/annotation.rst +++ /dev/null @@ -1,116 +0,0 @@ -==================== -Annotation Standards -==================== - -This document describes the target annotations spaCy is trained to predict. - -This is currently a work in progress. Please ask questions on the issue tracker, -so that the answers can be integrated here to improve the documentation. - -https://github.com/honnibal/spaCy/issues - -English -======= - -Tokenization ------------- - -Tokenization standards are based on the OntoNotes 5 corpus. - -The tokenizer differs from most by including tokens for significant whitespace. -Any sequence of whitespace characters beyond a single space (' ') is included -as a token. For instance: - - >>> from spacy.en import English - >>> nlp = English(parse=False) - >>> tokens = nlp(u'Some\nspaces and\ttab characters') - >>> print [t.orth_ for t in tokens] - [u'Some', u'\n', u'spaces', u' ', u'and', u'\t', u'tab', u'characters'] - -The whitespace tokens are useful for much the same reason punctuation is --- it's -often an important delimiter in the text. By preserving it in the token output, -we are able to maintain a simple alignment between the tokens and the original -string, and we ensure that the token stream does not lose information. - -Sentence boundary detection ---------------------------- - -Sentence boundaries are calculated from the syntactic parse tree, so features -such as punctuation and capitalisation play an important but non-decisive role -in determining the sentence boundaries. Usually this means that the sentence -boundaries will at least coincide with clause boundaries, even given poorly -punctuated text. - -Part-of-speech Tagging ----------------------- - -The part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank -tag set. We also map the tags to the simpler Google Universal POS Tag set. - -Details here: https://github.com/honnibal/spaCy/blob/master/spacy/en/pos.pyx#L124 - -Lemmatization -------------- - -A "lemma" is the uninflected form of a word. 
In English, this means: - -* Adjectives: The form like "happy", not "happier" or "happiest" -* Adverbs: The form like "badly", not "worse" or "worst" -* Nouns: The form like "dog", not "dogs"; like "child", not "children" -* Verbs: The form like "write", not "writes", "writing", "wrote" or "written" - -The lemmatization data is taken from WordNet. However, we also add a special -case for pronouns: all pronouns are lemmatized to the special token -PRON-. - -Syntactic Dependency Parsing ----------------------------- - -The parser is trained on data produced by the ClearNLP converter. Details of -the annotation scheme can be found here: - -http://www.mathcs.emory.edu/~choi/doc/clear-dependency-2012.pdf - -Named Entity Recognition ------------------------- - - +--------------+-----------------------------------------------------+ - | PERSON | People, including fictional | - +--------------+-----------------------------------------------------+ - | NORP | Nationalities or religious or political groups | - +--------------+-----------------------------------------------------+ - | FACILITY | Buildings, airports, highways, bridges, etc. | - +--------------+-----------------------------------------------------+ - | ORGANIZATION | Companies, agencies, institutions, etc. | - +--------------+-----------------------------------------------------+ - | GPE | Countries, cities, states | - +--------------+-----------------------------------------------------+ - | LOCATION | Non-GPE locations, mountain ranges, bodies of water | - +--------------+-----------------------------------------------------+ - | PRODUCT | Vehicles, weapons, foods, etc. (Not services) | - +--------------+-----------------------------------------------------+ - | EVENT | Named hurricanes, battles, wars, sports events, etc.| - +--------------+-----------------------------------------------------+ - | WORK OF ART | Titles of books, songs, etc. | - +--------------+-----------------------------------------------------+ - | LAW | Named documents made into laws | - +--------------+-----------------------------------------------------+ - | LANGUAGE | Any named language | - +--------------+-----------------------------------------------------+ - -The following values are also annotated in a style similar to names: - - +--------------+---------------------------------------------+ - | DATE | Absolute or relative dates or periods | - +--------------+---------------------------------------------+ - | TIME | Times smaller than a day | - +--------------+---------------------------------------------+ - | PERCENT | Percentage (including “%”) | - +--------------+---------------------------------------------+ - | MONEY | Monetary values, including unit | - +--------------+---------------------------------------------+ - | QUANTITY | Measurements, as of weight or distance | - +--------------+---------------------------------------------+ - | ORDINAL | "first", "second" | - +--------------+---------------------------------------------+ - | CARDINAL | Numerals that do not fall under another type| - +--------------+---------------------------------------------+ diff --git a/docs/source/reference/index.rst b/docs/source/reference/index.rst deleted file mode 100644 index 0d0e9cdf2..000000000 --- a/docs/source/reference/index.rst +++ /dev/null @@ -1,112 +0,0 @@ -============= -Documentation -============= - -The table below shows every class in spaCy: a link to its documentation, implementation, -and a small usage snippet. 
- - - +----------------+--------------------------+--------------------------------+ - | Class name | Usage | Implemention | - +================+==========================+================================+ - | `English`_ | doc = English() | `spacy/en/__init__.py`_ | - +----------------+--------------------------+--------------------------------+ - | Data objects | - +----------------+--------------------------+--------------------------------+ - | `Doc`_ | doc = nlp(text) | `spacy/doc.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Token`_ | token = doc[10] | `spacy/token.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Span`_ | sent = doc.sents.next() | `spacy/span.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Lexeme`_ | lex = nlp.vocab[u'word'] | `spacy/lexeme.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | Lookup tables | - +----------------+--------------------------+--------------------------------+ - | `Vocab`_ | nlp.vocab | `spacy/vocab.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `StringStore`_ | nlp.vocab.strings | `spacy/strings.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | Processing modules | - +----------------+--------------------------+--------------------------------+ - | `Tokenizer`_ | nlp.tokenizer | `spacy/tokenizer.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `EnPosTagger`_ | nlp.tagger | `spacy/en/pos.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Parser`_ | nlp.parser | `spacy/syntax/parser.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | Parser internals | - +----------------+--------------------------+--------------------------------+ - | ArcEager | | spacy/syntax/arc_eager.pyx | - +----------------+--------------------------+--------------------------------+ - | BiluoPushDown | | spacy/syntax/ner.pyx | - +----------------+--------------------------+--------------------------------+ - | StateClass | | spacy/syntax/stateclass.pyx | - +----------------+--------------------------+--------------------------------+ - | Research Utilities | - +----------------+--------------------------+--------------------------------+ - | `GoldParse`_ | | `spacy/gold.pyx`_ | - +----------------+--------------------------+--------------------------------+ - | `Scorer`_ | | `spacy/scorer.py`_ | - +----------------+--------------------------+--------------------------------+ - - -.. toctree:: - :maxdepth: 4 - - processing.rst - using/document.rst - using/span.rst - using/token.rst - using/lexeme.rst - lookup.rst - - -.. _English: processing.html - -.. _Doc: using/doc.html - -.. _Token: using/token.html - -.. _Span: using/span.html - -.. _Vocab: lookup.html - -.. _StringStore: lookup.html - -.. _Tokenizer: processing.html - -.. _EnPosTagger: processing.html - -.. _Parser: processing.html - -.. _Lexeme: lookup.html - -.. _Scorer: misc.html - -.. _GoldParse: misc.html - - -.. _spacy/en/__init__.py: https://github.com/honnibal/spaCy/tree/master/spacy/en/__init__.py - -.. _spacy/doc.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx - -.. _spacy/token.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokens.pyx - -.. 
_spacy/span.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/spans.pyx - -.. _spacy/vocab.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/vocab.pyx - -.. _spacy/strings.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/strings.pyx - -.. _spacy/tokenizer.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/tokenizer.pyx - -.. _spacy/en/pos.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/en/pos.pyx - -.. _spacy/syntax/parser.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/syntax/parser.pyx - -.. _spacy/lexeme.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/lexeme.pyx - -.. _spacy/gold.pyx: https://github.com/honnibal/spaCy/tree/master/spacy/gold.pyx - -.. _spacy/scorer.py: https://github.com/honnibal/spaCy/tree/master/spacy/scorer.py diff --git a/docs/source/reference/loading.rst b/docs/source/reference/loading.rst deleted file mode 100644 index 15a8d7427..000000000 --- a/docs/source/reference/loading.rst +++ /dev/null @@ -1,41 +0,0 @@ -================= -Loading Resources -================= -In more detail: - -.. code:: - - class English(object): - def __init__(self, - data_dir=path.join(path.dirname(__file__), 'data'), - Tokenizer=Tokenizer.from_dir, - Tagger=EnPosTagger, - Parser=Createarser(ArcEager), - Entity=CreateParser(BiluoNER), - load_vectors=True - ): - -:code:`data_dir` - :code:`unicode path` - - The data directory. May be None, to disable any data loading (including - the vocabulary). - -:code:`Tokenizer` - :code:`(Vocab vocab, unicode data_dir)(unicode) --> Doc` - - A class/function that creates the tokenizer. - -:code:`Tagger` / :code:`Parser` / :code:`Entity` - :code:`(Vocab vocab, unicode data_dir)(Doc) --> None` - - A class/function that creates the part-of-speech tagger / - syntactic dependency parser / named entity recogniser. - May be None or False, to disable tagging. - -:code:`load_vectors` - :code:`bool` - A boolean value to control whether the word vectors are loaded. - - - diff --git a/docs/source/reference/lookup.rst b/docs/source/reference/lookup.rst deleted file mode 100644 index 340cdb6c1..000000000 --- a/docs/source/reference/lookup.rst +++ /dev/null @@ -1,111 +0,0 @@ -Lexical Lookup --------------- - -Where possible, spaCy computes information over lexical *types*, rather than -*tokens*. If you process a large batch of text, the number of unique types -you will see will grow exponentially slower than the number of tokens --- so -it's much more efficient to compute over types. And, in small samples, we generally -want to know about the distribution of a word in the language at large --- -which again, is type-based information. - -You can access the lexical features via the Token object, but you can also look them -up in the vocabulary directly: - - >>> from spacy.en import English - >>> nlp = English() - >>> lexeme = nlp.vocab[u'Amazon'] - -.. py:class:: vocab.Vocab(self, data_dir=None, lex_props_getter=None) - - .. py:method:: __len__(self) - - :returns: number of words in the vocabulary - :rtype: int - - .. py:method:: __getitem__(self, key_int) - - :param int key: - Integer ID - - :returns: A Lexeme object - - .. py:method:: __getitem__(self, key_str) - - :param unicode key_str: - A string in the vocabulary - - :rtype: Lexeme - - - .. py:method:: __setitem__(self, orth_str, props) - - :param unicode orth_str: - The orth key - - :param dict props: - A props dictionary - - :returns: None - - .. py:method:: dump(self, loc) - - :param unicode loc: - Path where the vocabulary should be saved - - .. 
py:method:: load_lexemes(self, loc) - - :param unicode loc: - Path to load the lexemes.bin file from - - .. py:method:: load_vectors(self, loc) - - :param unicode loc: - Path to load the vectors.bin from - - -.. py:class:: strings.StringStore(self) - - .. py:method:: __len__(self) - - :returns: - Number of strings in the string-store - - .. py:method:: __getitem__(self, key_int) - - :param int key_int: An integer key - - :returns: - The string that the integer key maps to - - :rtype: unicode - - .. py:method:: __getitem__(self, key_unicode) - - :param int key_unicode: - A key, as a unicode string - - :returns: - The integer ID of the string. - - :rtype: int - - .. py:method:: __getitem__(self, key_utf8_bytes) - - :param int key_utf8_bytes: - A key, as a UTF-8 encoded byte-string - - :returns: - The integer ID of the string. - - :rtype: - int - - .. py:method:: dump(self, loc) - - :param loc: - File path to save the strings.txt to. - - .. py:method:: load(self, loc) - - :param loc: - File path to load the strings.txt from. diff --git a/docs/source/reference/processing.rst b/docs/source/reference/processing.rst deleted file mode 100644 index ec8e8ebca..000000000 --- a/docs/source/reference/processing.rst +++ /dev/null @@ -1,89 +0,0 @@ -================ -spacy.en.English -================ - - -99\% of the time, you will load spaCy's resources using a language pipeline class, -e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a -specified directory. By default, spaCy installs data into each language's -package directory, and loads it from there. - -Usually, this is all you will need: - - >>> from spacy.en import English - >>> nlp = English() - -If you need to replace some of the components, you may want to just make your -own pipeline class --- the English class itself does almost no work; it just -applies the modules in order. You can also provide a function or class that -produces a tokenizer, tagger, parser or entity recognizer to :code:`English.__init__`, -to customize the pipeline: - - >>> from spacy.en import English - >>> from my_module import MyTagger - >>> nlp = English(Tagger=MyTagger) - -The text processing API is very small and simple. Everything is a callable object, -and you will almost always apply the pipeline all at once. - - -.. py:class:: spacy.en.English - - .. py:method:: __init__(self, data_dir=..., Tokenizer=..., Tagger=..., Parser=..., Entity=..., Matcher=..., Packer=None, load_vectors=True) - - :param unicode data_dir: - The data directory. May be None, to disable any data loading (including - the vocabulary). - - :param Tokenizer: - A class/function that creates the tokenizer. - - :param Tagger: - A class/function that creates the part-of-speech tagger. - - :param Parser: - A class/function that creates the dependency parser. - - :param Entity: - A class/function that creates the named entity recogniser. - - :param bool load_vectors: - A boolean value to control whether the word vectors are loaded. - - .. py:method:: __call__(text, tag=True, parse=True, entity=True) --> Doc - - :param unicode text: - The text to be processed. No pre-processing needs to be applied, and any - length of text can be submitted. Usually you will submit a whole document. - Text may be zero-length. An exception is raised if byte strings are supplied. - - :param bool tag: - Whether to apply the part-of-speech tagger. Required for parsing and entity - recognition. - - :param bool parse: - Whether to apply the syntactic dependency parser. 
- - :param bool entity: - Whether to apply the named entity recognizer. - - :return: A document - :rtype: :py:class:`spacy.tokens.Doc` - - :Example: - - >>> from spacy.en import English - >>> nlp = English() - >>> doc = nlp(u'Some text.) # Applies tagger, parser, entity - >>> doc = nlp(u'Some text.', parse=False) # Applies tagger and entity, not parser - >>> doc = nlp(u'Some text.', entity=False) # Applies tagger and parser, not entity - >>> doc = nlp(u'Some text.', tag=False) # Does not apply tagger, entity or parser - >>> doc = nlp(u'') # Zero-length tokens, not an error - >>> doc = nlp(b'Some text') # Error: need unicode - Traceback (most recent call last): - File "", line 1, in - File "spacy/en/__init__.py", line 128, in __call__ - tokens = self.tokenizer(text) - TypeError: Argument 'string' has incorrect type (expected unicode, got str) - >>> doc = nlp(b'Some text'.decode('utf8')) # Encode to unicode first. - >>> diff --git a/docs/source/reference/using/document.rst b/docs/source/reference/using/document.rst deleted file mode 100644 index 7507f7f21..000000000 --- a/docs/source/reference/using/document.rst +++ /dev/null @@ -1,94 +0,0 @@ -============== -The Doc Object -============== - - -.. py:class:: spacy.tokens.doc.Doc - - .. py:method:: __init__(self, Vocab vocab, orths_and_spaces=None) - - :param Vocab vocab: A vocabulary object. - - :param list orths_and_spaces=None: Defaults to None. - - .. py:method:: __getitem__(self, int i) - - :returns: Token - - .. py:method:: __getitem__(self, slice start_colon_end) - - :returns: Span - - .. py:method:: __iter__(self) - - Iterate over tokens - - .. code:: - - >>> tokens = nlp(u'Zero one two three four five six') - >>> tokens[0].orth_ - u'Zero' - >>> tokens[-1].orth_ - u'six' - - .. py:method:: __len__(self) - - Number of tokens - - .. py:attribute:: sents - - Iterate over sentences in the document. - - :returns generator: Sentences - - .. py:attribute:: ents - - Iterate over named entities in the document. - - :returns tuple: Named Entities - - .. py:attribute:: noun_chunks - - :returns generator: - - .. py:method:: to_array(self, list attr_ids) - - Given a list of M attribute IDs, export the tokens to a numpy ndarray - of shape N*M, where N is the length of the sentence. - - :param list[int] attr_ids: A list of attribute ID ints. - - :returns feat_array: - A feature matrix, with one row per word, and one column per attribute - indicated in the input attr_ids. - - .. py:method:: count_by(self, attr_id) - - Produce a dict of {attribute (int): count (ints)} frequencies, keyed - by the values of the given attribute ID. - - .. code:: - - >>> from spacy.en import English, attrs - >>> nlp = English() - >>> tokens = nlp(u'apple apple orange banana') - >>> tokens.count_by(attrs.ORTH) - {12800L: 1, 11880L: 2, 7561L: 1} - >>> tokens.to_array([attrs.ORTH]) - array([[11880], - [11880], - [ 7561], - [12800]]) - - .. py:method:: from_array(self, attrs, array) - - .. py:method:: to_bytes(self) - - .. py:method:: from_bytes(self) - - .. py:method:: read_bytes(self) - - .. py:method:: merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, unicode ent_type) - - Merge a multi-word expression into a single token. Currently - experimental; API is likely to change. diff --git a/docs/source/reference/using/index.rst b/docs/source/reference/using/index.rst deleted file mode 100644 index cf8b0cde2..000000000 --- a/docs/source/reference/using/index.rst +++ /dev/null @@ -1,11 +0,0 @@ -================== -Annotation Objects -================== - - -.. 
toctree:: - :maxdepth: 3 - - document.rst - token.rst - span.rst diff --git a/docs/source/reference/using/span.rst b/docs/source/reference/using/span.rst deleted file mode 100644 index c3c78a68f..000000000 --- a/docs/source/reference/using/span.rst +++ /dev/null @@ -1,58 +0,0 @@ -=============== -The Span Object -=============== - -.. autoclass:: spacy.spans.Span - -.. py:class:: Span - - - .. py:method:: __getitem__ - - .. py:method:: __iter__ - - .. py:method:: __len__ - - .. py:attribute:: root - - Syntactic head - - .. py:attribute:: lefts - - Tokens that are: - - 1. To the left of the span; - 2. Syntactic children of words within the span - - i.e. - - .. code:: - - lefts = [span.doc[i] for i in range(0, span.start) if span.doc[i].head in span] - - .. py:attribute:: rights - - Tokens that are: - - 1. To the right of the span; - 2. Syntactic children of words within the span - - i.e. - - .. code:: - - rights = [span.doc[i] for i in range(span.end, len(span.doc)) if span.doc[i].head in span] - - Tokens that are: - - 1. To the right of the span; - 2. Syntactic children of words within the span - - - .. py:attribute:: string - - .. py:attribute:: lemma / lemma\_ - - .. py:attribute:: label / label\_ - - .. py:attribute:: subtree diff --git a/docs/source/reference/using/token.rst b/docs/source/reference/using/token.rst deleted file mode 100644 index 721e78820..000000000 --- a/docs/source/reference/using/token.rst +++ /dev/null @@ -1,195 +0,0 @@ -================ -The Token Object -================ - -A Token represents a single word, punctuation or significant whitespace symbol. - -Integer IDs are provided for all string features. The (unicode) string is -provided by an attribute of the same name followed by an underscore, e.g. -token.orth is an integer ID, token.orth\_ is the unicode value. - -The only exception is the Token.string attribute, which is (unicode) -string-typed. - - -.. py:class:: Token - - .. py:method:: __init__(self, Vocab vocab, Doc doc, int offset) - - **String Views** - - .. py:attribute:: orth / orth\_ - - The form of the word with no string normalization or processing, as it - appears in the string, without trailing whitespace. - - .. py:attribute:: lemma / lemma\_ - - The "base" of the word, with no inflectional suffixes, e.g. the lemma of - "developing" is "develop", the lemma of "geese" is "goose", etc. Note that - *derivational* suffixes are not stripped, e.g. the lemma of "instutitions" - is "institution", not "institute". Lemmatization is performed using the - WordNet data, but extended to also cover closed-class words such as - pronouns. By default, the WN lemmatizer returns "hi" as the lemma of "his". - We assign pronouns the lemma -PRON-. - - .. py:attribute:: lower / lower\_ - - The form of the word, but forced to lower-case, i.e. lower = word.orth\_.lower() - - .. py:attribute:: norm / norm\_ - - The form of the word, after language-specific normalizations have been - applied. - - .. py:attribute:: shape / shape\_ - - A transform of the word's string, to show orthographic features. The - characters a-z are mapped to x, A-Z is mapped to X, 0-9 is mapped to d. - After these mappings, sequences of 4 or more of the same character are - truncated to length 4. Examples: C3Po --> XdXx, favorite --> xxxx, - :) --> :) - - .. py:attribute:: prefix / prefix\_ - - A length-N substring from the start of the word. Length may vary by - language; currently for English n=1, i.e. prefix = word.orth\_[:1] - - .. 
py:attribute:: suffix / suffix\_ - - A length-N substring from the end of the word. Length may vary by - language; currently for English n=3, i.e. suffix = word.orth\_[-3:] - - .. py:attribute:: lex_id - - **Alignment and Output** - - .. py:attribute:: idx - - .. py:method:: __len__(self) - - .. py:method:: __unicode__(self) - - .. py:method:: __str__(self) - - .. py:attribute:: string - - The form of the word as it appears in the string, **including trailing - whitespace**. This is useful when you need to use linguistic features to - add inline mark-up to the string. - - .. py:method:: nbor(self, int i=1) - - **Distributional Features** - - .. py:attribute:: repvec - - A "word embedding" representation: a dense real-valued vector that supports - similarity queries between words. By default, spaCy currently loads - vectors produced by the Levy and Goldberg (2014) dependency-based word2vec - model. - - .. py:attribute:: cluster - - The Brown cluster ID of the word. These are often useful features for - linear models. If you're using a non-linear model, particularly - a neural net or random forest, consider using the real-valued word - representation vector, in Token.repvec, instead. - - .. py:attribute:: prob - - The unigram log-probability of the word, estimated from counts from a - large corpus, smoothed using Simple Good Turing estimation. - - **Navigating the Dependency Tree** - - .. py:attribute:: pos / pos\_ - - A part-of-speech tag, from the Google Universal Tag Set, e.g. NOUN, VERB, - ADV. Constants for the 17 tag values are provided in spacy.parts\_of\_speech. - - .. py:attribute:: tag / tag\_ - - A morphosyntactic tag, e.g. NN, VBZ, DT, etc. These tags are - language/corpus specific, and typically describe part-of-speech and some - amount of morphological information. For instance, in the Penn Treebank - tag set, VBZ is assigned to a present-tense singular verb. - - .. py:attribute:: dep / dep\_ - - The type of syntactic dependency relation between the word and its - syntactic head. - - .. py:attribute:: head - - The Token that is the immediate syntactic head of the word. If the word is - the root of the dependency tree, the same word is returned. - - .. py:attribute:: lefts - - An iterator for the immediate leftward syntactic children of the word. - - .. py:attribute:: rights - - An iterator for the immediate rightward syntactic children of the word. - - .. py:attribute:: n_lefts - - The number of immediate syntactic children preceding the word in the - string. - - .. py:attribute:: n_rights - - The number of immediate syntactic children following the word in the - string. - - .. py:attribute:: children - - An iterator that yields from lefts, and then yields from rights. - - .. py:attribute:: subtree - - An iterator for the part of the sentence syntactically governed by the - word, including the word itself. - - .. py:attribute:: left_edge - - .. py:attribute:: right_edge - - .. py:attribute:: conjuncts - - **Named Entities** - - .. py:attribute:: ent_type - - If the token is part of an entity, its entity type - - .. py:attribute:: ent_iob - - The IOB (inside, outside, begin) entity recognition tag for the token - - **Lexeme Flags** - - .. py:method:: check_flag(self, attr_id_t flag_id) - - .. py:attribute:: is_oov - - .. py:attribute:: is_alpha - - .. py:attribute:: is_ascii - - .. py:attribute:: is_digit - - .. py:attribute:: is_lower - - .. py:attribute:: is_title - - .. py:attribute:: is_punct - - .. py:attribute:: is_space - - .. py:attribute:: like_url - - .. 
py:attribute:: like_num - - .. py:attribute:: like_email diff --git a/docs/source/tutorials/lexrank_tutorial.rst b/docs/source/tutorials/lexrank_tutorial.rst deleted file mode 100644 index f5c5ae8fd..000000000 --- a/docs/source/tutorials/lexrank_tutorial.rst +++ /dev/null @@ -1,280 +0,0 @@ -=================================== -Tutorial: Extractive Summarization -=================================== - -This tutorial will go through the implementation of several extractive -summarization models with spaCy. - -An *extractive* summarization system is a filter over the original document/s: -most of the text is removed, and the remaining text is formatted as a summary. -In contrast, an *abstractive* summarization system generates new text. - -Application Context -------------------- - -Extractive summarization systems need an application context. We can't ask how -to design the system without some concept of what sort of summary will be -useful for a given application. (Contrast with speech recognition, where -a notion of "correct" is much less application-sensitive.) - -For this, I've adopted the application context that `Flipboard`_ discuss in a -recent blog post: they want to display lead-text to readers on mobile devices, -so that readers can easily choose interesting links. - -I've chosen this application context for two reasons. First, `Flipboard`_ say -they're putting something like this into production. Second, there's a ready -source of evaluation data. We can look at the lead-text that human editors -have chosen, and evaluate whether our automatic system chooses similar text. - -Experimental Setup ------------------- - -Instead of scraping data, I'm using articles from the New York Times Annotated -Corpus, which is a handy dump of XML-annotated articles distributed by the LDC. -The annotations come with a field named "online lead paragraph". Our -summarization systems will be evaluated on their Rouge-1 overlap with this -field. - -Further details of the experimental setup can be found in the appendices. - -.. _newyorktimes.com: http://newyorktimes.com - -.. _Flipboard: http://engineering.flipboard.com/2014/10/summarization/ - -.. _vector-space model: https://en.wikipedia.org/wiki/Vector_space_model - -.. _LexRank algorithm: https://www.cs.cmu.edu/afs/cs/project/jair/pub/volume22/erkan04a-html/erkan04a.html - -.. _PageRank: https://en.wikipedia.org/wiki/PageRank - -Summarizer API --------------- - -Each summarization model will have the following API: - -.. py:func:`summarize(nlp: spacy.en.English, headline: unicode, paragraphs: List[unicode], - target_length: int) --> summary: unicode - -We receive the headline and a list of paragraphs, and a target length. We have -to produce a block of text where len(text) < target_length. We want summaries -that users will click-on, and not bounce back out of. Long-term, we want -summaries that would keep people using the app. - -Baselines: Truncate -------------------- - -.. code:: python - - def truncate_chars(nlp, headline, paragraphs, target_length): - text = ' '.join(paragraphs) - return text[:target_length - 3] + '...' - - def truncate_words(nlp, headline, paragraphs, target_length): - text = ' '.join(paragraphs) - tokens = text.split() - summary = [] - n_words = 0 - n_chars = 0 - while n_chars < target_length - 3: - n_chars += len(tokens[n_words]) - n_chars += 1 # Space - n_words += 1 - return ' '.join(tokens[:n_words]) + '...' 
- - def truncate_sentences(nlp, headline, paragraphs, target_length): - sentences = [] - summary = '' - for para in paragraphs: - tokens = nlp(para) - for sentence in tokens.sentences(): - if len(summary) + len(sentence) >= target_length: - return summary - summary += str(sentence) - return summary - -I'd be surprised if Flipboard never had something like this in production. Details -like lead-text take a while to float up the priority list. This strategy also has -the advantage of transparency: it's obvious to users how the decision is being -made, so nobody is likely to complain about the feature if it works this way. - -Instead of cutting off the text mid-word, we can tokenize the text, and - -+----------------+-----------+ -| System | Rouge-1 R | -+----------------+-----------+ -| Truncate chars | 69.3 | -+----------------+-----------+ -| Truncate words | 69.8 | -+----------------+-----------+ -| Truncate sents | 48.5 | -+----------------+-----------+ - -Sentence Vectors ----------------- - -A simple bag-of-words model can be created using the `count_by` method, which -produces a dictionary of frequencies, keyed by string IDs: - -.. code:: python - - >>> from spacy.en import English - >>> from spacy.en.attrs import SIC - >>> nlp = English() - >>> tokens = nlp(u'a a a. b b b b.') - >>> tokens.count_by(SIC) - {41L: 4, 11L: 3, 5L: 2} - >>> [s.count_by(SIC) for s in tokens.sentences()] - [{11L: 3, 5L: 1}, {41L: 4, 5L: 1}] - - -Similar functionality is provided by `scikit-learn`_, but with a different -style of API design. With spaCy, functions generally have more limited -responsibility. The advantage of this is that spaCy's APIs are much simpler, -and it's often easier to compose functions in a more flexible way. - -One particularly powerful feature of spaCy is its support for -`word embeddings`_ --- the dense vectors introduced by deep learning models, and -now commonly produced by `word2vec`_ and related systems. - -Once a set of word embeddings has been installed, the vectors are available -from any token: - - >>> from spacy.en import English - >>> from spacy.en.attrs import SIC - >>> from scipy.spatial.distance import cosine - >>> nlp = English() - >>> tokens = nlp(u'Apple banana Batman hero') - >>> cosine(tokens[0].vec, tokens[1].vec) - - - - - -.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/ - -.. _word2vec: https://code.google.com/p/word2vec/ - -.. code:: python - - def main(db_loc, output_dir, feat_type="tfidf"): - nlp = spacy.en.English() - - # Read stop list and make TF-IDF weights --- data needed for the - # feature extraction. - with open(stops_loc) as file_: - stop_words = set(nlp.vocab.strings[word.strip()] for word in file_) - idf_weights = get_idf_weights(nlp, iter_docs(db_loc)) - if feat_type == 'tfidf': - feature_extractor = tfidf_extractor(stop_words, idf_weights) - elif feat_type == 'vec': - feature_extractor = vec_extractor(stop_words, idf_weights) - - for i, text in enumerate(iter_docs(db_loc)): - tokens = nlp(body) - sentences = tokens.sentences() - summary = summarize(sentences, feature_extractor) - write_output(summary, output_dir, i) - - - - -.. _scikit-learn: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.feature_extraction.text - - - - - -The LexRank Algorithm ----------------------- - -LexRank is described as a graph-based algorithm, derived from `Google's PageRank`_. -The nodes are sentences, and the edges are the similarities between one -sentence and another. 
The "graph" is fully-connected, and its edges are -undirected --- so, it's natural to represent this as a matrix: - -.. code:: python - - from scipy.spatial.distance import cosine - import numpy - - - def lexrank(sent_vectors): - n = len(sent_vectors) - # Build the cosine similarity matrix - matrix = numpy.ndarray(shape=(n, n)) - for i in range(n): - for j in range(n): - matrix[i, j] = cosine(sent_vectors[i], sent_vectors[j]) - # Normalize - for i in range(n): - matrix[i] /= sum(matrix[i]) - return _pagerank(matrix) - -The rows are normalized (i.e. rows sum to 1), allowing the PageRank algorithm -to be applied. Unfortunately the PageRank implementation is rather opaque --- -it's easier to just read the Wikipedia page: - -.. code:: python - - def _pagerank(matrix, d=0.85): - # This is admittedly opaque --- just read the Wikipedia page. - n = len(matrix) - rank = numpy.ones(shape=(n,)) / n - new_rank = numpy.zeros(shape=(n,)) - while not _has_converged(rank, new_rank): - rank, new_rank = new_rank, rank - for i in range(n): - new_rank[i] = ((1.0 - d) / n) + (d * sum(rank * matrix[i])) - return rank - - def _has_converged(x, y, epsilon=0.0001): - return all(abs(x[i] - y[i]) < epsilon for i in range(n)) - - -Initial Processing ------------------- - - - - -Feature Extraction ------------------- - - .. code:: python - def sentence_vectors(sentence, idf_weights): - tf_idf = {} - for term, freq in sent.count_by(LEMMA).items(): - tf_idf[term] = freq * idf_weights[term] - vectors.append(tf_idf) - return vectors - -The LexRank paper models each sentence as a bag-of-words - -This is simple and fairly standard, but often gives -underwhelming results. My idea is to instead calculate vectors from -`word-embeddings`_, which have been one of the exciting outcomes of the recent -work on deep-learning. I had a quick look at the literature, and found -a `recent workshop paper`_ that suggested the idea was plausible. - - - - -Taking the feature representation and similarity function as parameters, the -LexRank function looks like this: - - -Given a list of N sentences, a function that maps a sentence to a feature -vector, and a function that computes a similarity measure of two feature -vectors, this produces a vector of N floats, which indicate how well each -sentence represents the document as a whole. - -.. _Rouge: https://en.wikipedia.org/wiki/ROUGE_%28metric%29 - - -.. _word embeddings: https://colah.github.io/posts/2014-07-NLP-RNNs-Representations/ - -.. _recent workshop paper: https://www.aclweb.org/anthology/W/W14/W14-1504.pdf - - -Document Model --------------- diff --git a/docs/source/updates.rst b/docs/source/updates.rst deleted file mode 100644 index 7e298abf1..000000000 --- a/docs/source/updates.rst +++ /dev/null @@ -1,233 +0,0 @@ -Updates -======= - -To update your installation: - -.. code:: bash - - $ pip install --upgrade spacy - $ python -m spacy.en.download all - -Most updates ship a new model, so you will usually have to redownload the data. - -v0.89 ------ - -* Fix regression in parse times on very long texts. Recent versions were - calculating parse features in a way that was polynomial in input length. -* Add tag SP (coarse tag SPACE) for whitespace tokens. Ensure entity recogniser - does not assign entities to whitespace. -* Rename :code:`Span.head` to :code:`Span.root`, fix its documentation, and make - it more efficient. 
I considered adding Span.head, Span.dep and Span.dep\_ as - well, but for now I leave these as accessible via :code:`Span.root.head`, - :code:`Span.head.dep`, and :code:`Span.head.dep\_`, to keep the API smaller. - - -2015-07-08 v0.88 ----------------- - -Refactoring release. - -If you have the data for v0.87, you don't need to redownload the data for this -release. - -* You can now set tag=False, parse=False or entity=False when creating the pipleine, - to disable some of the models. See the documentation for details. -* Models no longer lazy-loaded. -* Warning emitted when parse=True or entity=True but model not loaded. -* Rename the tokens.Tokens class to tokens.Doc. An alias has been made to assist - backwards compatibility, but you should update your code to refer to the new - class name. -* Various bits of internal refactoring - - -2015-07-01 v0.87 ----------------- - -* Changed weights data structure. Memory use should be reduced 30-40%. -* Fixed speed regressions introduced in the last few versions. -* Models should now be slightly more robust to noise in the input text, as I'm - now training on data with a small amount of noise added, e.g. I randomly corrupt - capitalization, swap spaces for newlines, etc. This is bringing a small - benefit on out-of-domain data. I think this strategy could yield better - results with a better noise-generation function. If you think you have a good - way to make clean text resemble the kind of noisy input you're seeing in your - domain, get in touch. - -2015-06-24 v0.86 ----------------- - -* Parser now more accurate, using novel non-monotonic transition system that's - currently under review. - - -2015-05-12 v0.85 ----------------- - -* Parser produces richer dependency labels following the `ClearNLP scheme`_ -* Training data now includes text from a variety of genres. -* Parser now uses more memory and the data is slightly larger, due to the additional - labels. Impact on efficiency is minimal: entire process still takes - <10ms per document. - -Most users should see a substantial increase in accuracy from the new model. -Long post on accuracy evaluation and model details coming soon. - -.. _ClearNLP scheme: https://github.com/clir/clearnlp-guidelines/blob/master/md/dependency/dependency_guidelines.md - - -2015-05-12 v0.84 ----------------- - -* Bug fixes for parsing -* Bug fixes for named entity recognition - -2015-04-13 v0.80 ----------------- - -* Preliminary support for named-entity recognition. Its accuracy is substantially behind the state-of-the-art. I'm working on improvements. - -* Better sentence boundary detection, drawn from the syntactic structure. - -* Lots of bug fixes. - -2015-03-05 v0.70 ----------------- - -* Improved parse navigation API -* Bug fixes to labelled parsing - - -2015-01-30 spaCy v0.4: Still alpha, improving quickly ------------------------------------------------------ - -Five days ago I presented the alpha release of spaCy, a natural language -processing library that brings state-of-the-art technology to small companies. - -spaCy has been well received, and there are now a lot of eyes on the project. -Naturally, lots of issues have surfaced. I'm grateful to those who've reported -them. I've worked hard to address them as quickly as I could. - -Bug Fixes ----------- - -* Lexemes.bin data file had a platform-specific encoding. - This was a silly error: instead of the string, or an index into the - list of strings, I was storing the 64-bit hash of the string. 
On - wide-unicode builds, a unicode string hashes differently. This meant that - all look-ups into the vocabulary failed on wide unicode builds, which - further meant that the part-of-speech tagger and parser features were not - computed correctly. - - The fix is simple: we already have to read in a list of all the strings, so - just store an index into that list, instead of a hash. - -* Parse tree navigation API was rough, and buggy. - The parse-tree navigation API was the last thing I added before v0.3. I've - now replaced it with something better. The previous API design was flawed, - and the implementation was buggy --- Token.child() and Token.head were - sometimes inconsistent. - - I've addressed the most immediate problems, but this part of the design is - still a work in progress. It's a difficult problem. The parse is a tree, - and we want to freely navigate up and down it without creating reference - cycles that inhibit garbage collection, and without doing a lot of copying, - creating and deleting. - - I think I've got a promising solution to this, but I suspect there's - currently a memory leak. Please get in touch no the tracker if you want to - know more, especially if you think you can help. - -Known Issues ------------- - -Some systems are still experiencing memory errors, which I'm having trouble -pinning down or reproducing. Please send details of your system to the -`Issue Tracker`_ if this is happening to you. - -.. _Issue Tracker: https://github.com/honnibal/spaCy/issues - -Enhancements: Train and evaluate on whole paragraphs ----------------------------------------------------- - -.. note:: tl;dr: I shipped the wrong parsing model with 0.3. That model expected input to be segmented into sentences. 0.4 ships the correct model, which uses some algorithmic tricks to minimize the impact of tokenization and sentence segmentation errors on the parser. - - -Most English parsing research is performed on text with perfect pre-processing: -one newline between every sentence, one space between every token. -It's always been done this way, and it's good. It's a useful idealisation, -because the pre-processing has few algorithmic implications. - -But, for practical performance, this stuff can matter a lot. -Dridan and Oepen (2013) did a simple but rare thing: they actually ran a few -parsers on raw text. Even on the standard Wall Street Journal corpus, -where pre-processing tools are quite good, the quality of pre-processing -made a big difference: - - +-------------+-------+----------+ - | Preprocess | BLLIP | Berkeley | - +-------------+-------+----------+ - | Gold | 90.9 | 89.8 | - +-------------+-------+----------+ - | Default | 86.4 | 88.4 | - +-------------+-------+----------+ - | Corrected | 89.9 | 88.8 | - +-------------+-------+----------+ - -.. note:: spaCy is evaluated on unlabelled dependencies, where the above accuracy figures refer to phrase-structure trees. Accuracies are non-comparable. - - - -In the standard experimental condition --- gold pre-processing --- the -BLLIP parser is better. But, it turns out it ships with lousy pre-processing -tools: when you evaluate the parsers on raw text, the BLLIP parser falls way -behind. To verify that this was due to the quality of the pre-processing -tools, and not some particular algorithmic sensitivity, Dridan and Oepen ran -both parsers with their high-quality tokenizer and sentence segmenter. This -confirmed that with equal pre-processing, the BLLIP parser is better. 
- -The Dridan and Oepen paper really convinced me to take pre-processing seriously -in spaCy. In fact, spaCy started life as just a tokenizer --- hence the name. - -The spaCy parser has a special trick up its sleeve. Because both the tagger -and parser run in linear time, it doesn't require that the input be divided -into sentences. This is nice because it avoids error-cascades: if you segment -first, then the parser just has to live with whatever decision the segmenter -made. - -But, even though I designed the system with this consideration in mind, -I decided to present the initial results using the standard methodology, using -gold-standard inputs. But...then I made a mistake. - -Unfortunately, with all the other things I was doing before launch, I forgot -all about this problem. spaCy launched with a parsing model that expected the -input to be segmented into sentences, but with no sentence segmenter. This -caused a drop in parse accuracy of 4%! - -Over the last five days, I've worked hard to correct this. I implemented the -modifications to the parsing algorithm I had planned, from Dongdong Zhang et al. -(2013), and trained and evaluated the parser on raw text, using the version of -the WSJ distributed by Read et al. (2012), and used in Dridan and Oepen's -experiments. - -I'm pleased to say that on the WSJ at least, spaCy 0.4 performs almost exactly -as well on raw text as text with gold-standard tokenization and sentence -boundary detection. - -I still need to evaluate this on web text, and I need to compare against the -Stanford CoreNLP and other parsers. I suspect that most other parsers will -decline in accuracy by 1% --- we'll see. - - -+-------------+---------+ -| Preprocess | spaCy | -+-------------+---------+ -| Gold | 92.4% | -+-------------+---------+ -| Default | 92.2% | -+-------------+---------+ - -2015-01-25 ----------- - -spaCy v0.33 launched --- first alpha build. 
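The release notes above repeatedly make the point that spaCy's tagger and parser run on raw, unsegmented text, with sentence boundaries read off the parse afterwards. A minimal sketch of that usage, assuming the `spacy.en.English` entry point and the `Doc.sents` / `Span.string` attributes documented earlier in this series (the sample text is illustrative only):

.. code:: python

    from spacy.en import English

    nlp = English()
    # No pre-segmentation: the raw string goes straight into the pipeline.
    raw_text = u"The BLLIP parser ships with its own preprocessor. spaCy does not need one."
    doc = nlp(raw_text)
    for sent in doc.sents:
        # Each sentence is a Span whose boundaries are derived from the parse.
        print(sent.string)
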
From f9a6bea74602a88717be624b270b96eb672cac3e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:12:07 +0200 Subject: [PATCH 076/138] * Ignore keys and other things --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 2644ca342..dc6568914 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ MANIFEST corpora/ models/ +examples/ +keys/ spacy/syntax/*.cpp spacy/syntax/*.html From 692a8d3e3c6a03556097953d3bd5ec300a920c7f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:12:26 +0200 Subject: [PATCH 077/138] * Begin rewriting twitter_filter examples --- examples/twitter_filter.py | 143 +++++-------------------------------- 1 file changed, 19 insertions(+), 124 deletions(-) diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py index f842acdd4..b6e4e4e83 100644 --- a/examples/twitter_filter.py +++ b/examples/twitter_filter.py @@ -1,140 +1,35 @@ +# encoding: utf8 from __future__ import unicode_literals, print_function import plac import codecs -import sys -import math +import pathlib +import random +import twython import spacy.en -from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ -from termcolor import colored -from twython import TwythonStreamer - -from os import path -from math import sqrt - -from numpy import dot -from numpy.linalg import norm +import _handler -class Meaning(object): - def __init__(self, vectors): - if vectors: - self.vector = sum(vectors) / len(vectors) - self.norm = norm(self.vector) - else: - self.vector = None - self.norm = 0 - - @classmethod - def from_path(cls, nlp, loc): - with codecs.open(loc, 'r', 'utf8') as file_: - terms = file_.read().strip().split() - return cls.from_terms(nlp, terms) - - @classmethod - def from_tokens(cls, nlp, tokens): - vectors = [t.repvec for t in tokens] - return cls(vectors) - - @classmethod - def from_terms(cls, nlp, examples): - lexemes = [nlp.vocab[eg] for eg in examples] - vectors = [eg.repvec for eg in lexemes] - return cls(vectors) - - def similarity(self, other): - if not self.norm or not other.norm: - return -1 - return dot(self.vector, other.vector) / (self.norm * other.norm) - - -def print_colored(model, stream=sys.stdout): - if model['is_match']: - color = 'green' - elif model['is_reject']: - color = 'red' - else: - color = 'grey' - - if not model['is_rare'] and model['is_match'] and not model['is_reject']: - match_score = colored('%.3f' % model['match_score'], 'green') - reject_score = colored('%.3f' % model['reject_score'], 'red') - prob = '%.5f' % model['prob'] - - print(match_score, reject_score, prob) - print(repr(model['text']), color) - print('') - - -class TextMatcher(object): - def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject): +class Connection(twython.TwythonStreamer): + def __init__(self, keys_dir, nlp, query): + keys_dir = pathlib.Path(keys_dir) + read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip() + api_key = map(read, ['key', 'secret', 'token', 'token_secret']) + twython.TwythonStreamer.__init__(self, *api_key) self.nlp = nlp - self.get_target = get_target - self.get_reject = get_reject - self.min_prob = min_prob - self.min_match = min_match - self.max_reject = max_reject - - def __call__(self, text): - tweet = self.nlp(text) - target_terms = self.get_target() - reject_terms = self.get_reject() - - prob = sum(math.exp(w.prob) for w in tweet) / len(tweet) - meaning = Meaning.from_tokens(self, tweet) - - match_score = meaning.similarity(self.get_target()) - 
reject_score = meaning.similarity(self.get_reject()) - return { - 'text': tweet.string, - 'prob': prob, - 'match_score': match_score, - 'reject_score': reject_score, - 'is_rare': prob < self.min_prob, - 'is_match': prob >= self.min_prob and match_score >= self.min_match, - 'is_reject': prob >= self.min_prob and reject_score >= self.max_reject - } - - -class Connection(TwythonStreamer): - def __init__(self, keys_dir, handler, view): - keys = Secrets(keys_dir) - TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret) - self.handler = handler - self.view = view + self.query = query def on_success(self, data): - text = data.get('text', u'') - # Twython returns either bytes or unicode, depending on tweet. - # #APIshaming - try: - model = self.handler(text) - except TypeError: - model = self.handler(text.decode('utf8')) - status = self.view(model, sys.stdin) - - def on_error(self, status_code, data): - print(status_code) + _handler.handle_tweet(self.nlp, data, self.query) + if random.random() >= 0.1: + reload(_handler) -class Secrets(object): - def __init__(self, key_dir): - self.key = open(path.join(key_dir, 'key.txt')).read().strip() - self.secret = open(path.join(key_dir, 'secret.txt')).read().strip() - self.token = open(path.join(key_dir, 'token.txt')).read().strip() - self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip() - - -def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5): - # We don't need the parser for this demo, so may as well save the loading time - nlp = spacy.en.English(Parser=None) - get_target = lambda: Meaning.from_path(nlp, target_loc) - get_reject = lambda: Meaning.from_path(nlp, reject_loc) - matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject) - - twitter = Connection(keys_dir, matcher, print_colored) - twitter.statuses.filter(track=term) +def main(keys_dir, term): + nlp = spacy.en.English() + twitter = Connection(keys_dir, nlp, term) + twitter.statuses.filter(track=term, language='en') if __name__ == '__main__': From aa12b374c0ed82b04f8e2f9688683f5040524fb1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 22 Aug 2015 22:12:55 +0200 Subject: [PATCH 078/138] * Remove old doc tests --- tests/test_docs.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index c5307b5a0..70c8b8c63 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- """Sphinx doctest is just too hard. Manually paste doctest examples here""" -from spacy.en.attrs import IS_LOWER -import pytest @pytest.mark.models def test_1(): From 3879d28457ecf66f37e1e9c0da8ec29661144e52 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Aug 2015 02:40:35 +0200 Subject: [PATCH 079/138] * Fix https for url detection --- spacy/orth.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 6ffac839b..ca4bbd9ba 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu cpdef bint like_url(unicode string): # We're looking for things that function in text like URLs. So, valid URL # or not, anything they say http:// is going to be good. 
- if string.startswith('http://'): + if string.startswith('http://') or string.startswith('https://'): return True elif string.startswith('www.') and len(string) >= 5: return True From 6f1743692add1507b76b30ac6b347c662467446f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Aug 2015 20:49:18 +0200 Subject: [PATCH 080/138] * Work on language-independent refactoring --- spacy/en/__init__.py | 2 ++ spacy/lexeme.pxd | 33 ++++++++++++++++++++++++++-- spacy/lexeme.pyx | 9 +++----- spacy/matcher.pyx | 7 +++--- spacy/orth.pyx | 1 + spacy/strings.pyx | 2 ++ spacy/tokens/doc.pyx | 6 ++--- spacy/tokens/token.pyx | 27 ++++++++++++----------- spacy/vocab.pxd | 5 +++-- spacy/vocab.pyx | 50 +++++++++++++++++++++--------------------- 10 files changed, 88 insertions(+), 54 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index a04b615da..3d433e497 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -41,6 +41,8 @@ def get_lex_props(string, oov_prob=-30, is_oov=False): 'sentiment': 0 } +get_lex_attr = {} + if_model_present = -1 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 321f7c616..510840b2b 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE from .structs cimport LexemeC from .strings cimport StringStore +from .vocab cimport Vocab from numpy cimport ndarray @@ -15,7 +16,8 @@ cdef class Lexeme: cdef readonly Vocab vocab cdef readonly attr_t orth - cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: + @staticmethod + cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: lex.length = props['length'] lex.orth = vocab.strings[props['orth']] lex.lower = vocab.strings[props['lower']] @@ -29,7 +31,6 @@ cdef class Lexeme: lex.sentiment = props['sentiment'] lex.flags = props['flags'] - lex.repvec = empty_vec @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -55,6 +56,34 @@ cdef class Lexeme: return lex.cluster else: return 0 + + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value + @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + @staticmethod + cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: + cdef flags_t one = 1 + if value: + lex.flags |= one << flag_id + else: + lex.flags &= ~(one << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0b3303f1..4deec60c1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -26,12 +26,9 @@ cdef class Lexeme: def __init__(self, Vocab vocab, int orth): self.vocab = vocab self.orth = orth - self.c = vocab.get_by_orth(orth) + self.c = vocab.get_by_orth(vocab.mem, orth) + assert self.c.orth == orth - property orth: - def __get__(self): - return self.c.orth - property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -113,7 +110,7 @@ cdef class Lexeme: def __set__(self, attr_id_t x): 
Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 72473b073..9d1220648 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -103,20 +103,21 @@ cdef class Matcher: def __init__(self, vocab, patterns): self.mem = Pool() + self.vocab = vocab for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] + entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): - etype = vocab.strings[etype] + etype = self.vocab.strings[etype] elif etype is None: etype = -1 # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: - spec = _convert_strings(spec, vocab.strings) + spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod diff --git a/spacy/orth.pyx b/spacy/orth.pyx index ca4bbd9ba..df4e2dc32 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -92,6 +92,7 @@ cpdef bint like_url(unicode string): return False +# TODO: This should live in the language.orth NUM_WORDS = set('zero one two three four five six seven eight nine ten' 'eleven twelve thirteen fourteen fifteen sixteen seventeen' 'eighteen nineteen twenty thirty forty fifty sixty seventy' diff --git a/spacy/strings.pyx b/spacy/strings.pyx index c187a6aa6..a4a470158 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -142,6 +142,8 @@ cdef class StringStore: def load(self, loc): with codecs.open(loc, 'r', 'utf8') as file_: strings = file_.read().split(SEPARATOR) + if strings == ['']: + return None cdef unicode string cdef bytes byte_string for string in strings: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7994c97c3..0fa562dfb 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -12,8 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN -from ..lexeme cimport check_flag -from ..lexeme cimport get_attr as get_lex_attr +from ..lexeme cimport Lexeme from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -47,7 +46,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: elif feat_name == ENT_TYPE: return token.ent_type else: - return get_lex_attr(token.lex, feat_name) + return Lexeme.get_struct_attr(token.lex, feat_name) cdef class Doc: @@ -218,6 +217,7 @@ cdef class Doc: t.idx = 0 else: t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy + assert t.lex.orth != 0 t.spacy = has_space self.length += 1 self._py_tokens.append(None) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f1f2696cb..04945ecd1 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,5 @@ from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free -from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. 
from cython.view cimport array as cvarray cimport numpy as np @@ -20,6 +19,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV +from ..lexeme cimport Lexeme + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created @@ -42,7 +43,7 @@ cdef class Token: return self.string cpdef bint check_flag(self, attr_id_t flag_id) except -1: - return check_flag(self.c.lex, flag_id) + return Lexeme.check_flag(self.c.lex, flag_id) def nbor(self, int i=1): return self.doc[self.i+i] @@ -286,37 +287,37 @@ cdef class Token: return self.vocab.strings[self.c.dep] property is_oov: - def __get__(self): return check_flag(self.c.lex, IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) property is_alpha: - def __get__(self): return check_flag(self.c.lex, IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) property is_ascii: - def __get__(self): return check_flag(self.c.lex, IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return check_flag(self.c.lex, IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) property is_title: - def __get__(self): return check_flag(self.c.lex, IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return check_flag(self.c.lex, IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) property like_url: - def __get__(self): return check_flag(self.c.lex, LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 2503cdcee..cf7a46388 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -27,15 +27,16 @@ cdef class Vocab: cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings - cdef readonly object pos_tags cdef readonly int length cdef public object _serializer cdef public object data_dir - cdef public float oov_prob + cdef public object get_lex_attr + cdef public object pos_tags cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef PreshMap _by_hash diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index dcb7d575c..4c35ea41c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,7 +12,6 @@ import math import json from .lexeme cimport EMPTY_LEXEME -from .lexeme cimport set_lex_struct_props from .lexeme cimport Lexeme from .strings cimport hash_string from .orth cimport word_shape @@ -36,17 +35,15 
@@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_attr=None): + def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.pos_tags = pos_tags if pos_tags is not None else {} - self.get_lex_attr = get_lex_attr self.repvec_length = 0 - self.length = 0 - self._add_lex_to_vocab(0, &EMPTY_LEXEME) + self.length = 1 + self.pos_tags = pos_tags if data_dir is not None: if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) @@ -84,7 +81,10 @@ cdef class Vocab: cdef LexemeC* lex cdef hash_t key = hash_string(string) lex = self._by_hash.get(key) + cdef size_t addr if lex != NULL: + print string, lex.orth, self.strings[string] + assert lex.orth == self.strings[string] return lex else: return self._new_lexeme(mem, string) @@ -103,15 +103,24 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: + cdef hash_t key cdef bint is_oov = mem is not self.mem - if len(string) < 3: - mem = self.mem + mem = self.mem + #if len(string) < 3: + # mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - for attr, func in self.lex_attr_getters.items(): - Lexeme.set_struct_attr(lex, attr, func(string)) + lex.orth = self.strings[string] + lex.id = self.length + if self.get_lex_attr is not None: + for attr, func in self.get_lex_attr.items(): + value = func(string) + if isinstance(value, unicode): + value = self.strings[value] + Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 else: + key = hash_string(string) self._add_lex_to_vocab(key, lex) assert lex != NULL, string return lex @@ -119,13 +128,14 @@ cdef class Vocab: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: self._by_hash.set(key, lex) self._by_orth.set(lex.orth, lex) + print "Add lex", key, lex.orth, self.strings[lex.orth] self.length += 1 def __iter__(self): cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self.strings, self.repvec_length) + yield Lexeme(self, orth) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -142,22 +152,12 @@ cdef class Vocab: An instance of the Lexeme Python class, with data copied on instantiation. ''' - cdef const LexemeC* lexeme cdef attr_t orth - if type(id_or_string) == int: - orth = id_or_string - lexeme = self._by_orth.get(orth) - if lexeme == NULL: - raise KeyError(id_or_string) - assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth)) - elif type(id_or_string) == unicode: - lexeme = self.get(self.mem, id_or_string) - assert lexeme.orth == self.strings[id_or_string] + if type(id_or_string) == unicode: + orth = self.strings[id_or_string] else: - raise ValueError("Vocab unable to map type: " - "%s. 
Maps unicode --> Lexeme or " - "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) + orth = id_or_string + return Lexeme(self, orth) def dump(self, loc): if path.exists(loc): From 5d5922dbfaf160ef40f9ec62743fe51db1f86700 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Aug 2015 01:04:30 +0200 Subject: [PATCH 081/138] * Begin laying out morphological features --- spacy/morphology.pxd | 721 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 721 insertions(+) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 5dfee4250..6914eb8d6 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -2,3 +2,724 @@ from .structs cimport TokenC, Morphology, PosTag cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 + + +cdef enum Feature: + Abbr + AdpType + AdvType + ConjType + Connegative + Derivation + Echo + Foreign + Gender_dat + Gender_erg + Gender_psor + Hyph + InfForm + NameType + NounType + NumberAbs + NumberDat + NumberErg + NumberPsee + NumberPsor + NumForm + NumValue + PartForm + PartType + Person_abs + Person_dat + Person_psor + Polite + Polite_abs + Polite_dat + Prefix + PrepCase + PunctSide + PunctType + Style + Typo + Variant + VerbType +cpdef enum Animacy: + Anim + Inam + + +cpdef enum Aspect: + Freq + Imp + Mod + None_ + Perf + + +cpdef enum Case1: + Abe + Abl + Abs + Acc + Ade + All + Cau + Com + +cdef enum Case2: + Dat + Del + Dis + Ela + Ess + Gen + Ill + Ine + +cdef enum Case3: + Ins + Loc + Lat + Nom + Par + Sub + Sup + Tem + Ter + + +cdef enum Case4: + Tra + Voc + + +cpdef enum Definite: + Two + Def + Red + Ind + + +cpdef enum Degree: + Cmp + Comp + None_ + Pos + Sup + Abs + Com + Degree # du + + +cpdef enum Gender: + Com + Fem + Masc + Neut + + +cpdef enum Mood: + Cnd + Imp + Ind + N + Pot + Sub + Opt + + +cpdef enum Negative: + Neg + Pos + Yes + + +cpdef enum Number: + Com + Dual + None_ + Plur + Sing + Ptan # bg + Count # bg + + +cpdef enum NumType: + Card + Dist + Frac + Gen + Mult + None_ + Ord + Sets + + +cpdef enum Person: + One + Two + Three + None_ + + +cpdef enum Poss: + Yes + + +cpdef enum PronType1: + AdvPart + Art + Default + Dem + Ind + Int + Neg + +cpdef enum PronType2: + Prs + Rcp + Rel + Tot + Clit + Exc # es, ca, it, fa + Clit # it + + +cpdef enum Reflex: + Yes + + +cpdef enum Tense: + Fut + Imp + Past + Pres + +cpdef enum VerbForm1: + Fin + Ger + Inf + None_ + Part + PartFut + PartPast + +cpdef enum VerbForm2: + PartPres + Sup + Trans + Gdv # la + + +cpdef enum Voice: + Act + Cau + Pass + Mid # gkc + Int # hb + + +cpdef enum Abbr: + Yes # cz, fi, sl, U + +cpdef enum AdpType: + Prep # cz, U + Post # U + Voc # cz + Comprep # cz + Circ # U + Voc # U + + +cpdef enum AdvType1: + # U + Man + Loc + Tim + Deg + Cau + Mod + Sta + Ex + +cpdef enum AdvType2: + Adadj + +cpdef enum ConjType: + Oper # cz, U + Comp # cz, U + +cpdef enum Connegative: + Yes # fi + + +cpdef enum Derivation1: + Minen # fi + Sti # fi + Inen # fi + Lainen # fi + Ja # fi + Ton # fi + Vs # fi + Ttain # fi + +cpdef enum Derivation2: + Ttaa + + +cpdef enum Echo: + Rdp # U + Ech # U + + +cpdef enum Foreign: + Foreign # cz, fi, U + Fscript # cz, fi, U + Tscript # cz, U + Yes # sl + + +cpdef enum Gender_dat: + Masc # bq, U + Fem # bq, U + + +cpdef enum Gender_erg: + Masc # bq + Fem # bq + + +cpdef enum Gender_psor: + Masc # cz, sl, U + Fem # cz, sl, U + Neut # sl + + +cpdef enum Hyph: + Yes # cz, U + + +cpdef enum InfForm: + One # fi + Two # fi + Three # fi + + +cpdef enum NameType: + Geo # U, 
cz + Prs # U, cz + Giv # U, cz + Sur # U, cz + Nat # U, cz + Com # U, cz + Pro # U, cz + Oth # U, cz + + +cpdef enum NounType: + Com # U + Prop # U + Class # U + +cpdef enum Number_abs: + Sing # bq, U + Plur # bq, U + +cpdef enum Number_dat: + Sing # bq, U + Plur # bq, U + +cpdef enum Number_erg: + Sing # bq, U + Plur # bq, U + +cpdef enum Number_psee: + Sing # U + Plur # U + + +cpdef enum Number_psor: + Sing # cz, fi, sl, U + Plur # cz, fi, sl, U + + +cpdef enum NumForm: + Digit # cz, sl, U + Roman # cz, sl, U + Word # cz, sl, U + + +cpdef enum NumValue: + One # cz, U + Two # cz, U + Three # cz, U + + +cpdef enum PartForm: + Pres # fi + Past # fi + Agt # fi + Neg # fi + + +cpdef enum PartType: + Mod # U + Emp # U + Res # U + Inf # U + Vbp # U + +cpdef enum Person_abs: + One # bq, U + Two # bq, U + Three # bq, U + + +cpdef enum Person_dat: + One # bq, U + Two # bq, U + Three # bq, U + + +cpdef enum Person_erg: + One # bq, U + Two # bq, U + Three # bq, U + + +cpdef enum Person_psor: + One # fi, U + Two # fi, U + Three # fi, U + + +cpdef enum Polite: + Inf # bq, U + Pol # bq, U + + +cpdef enum Polite_abs: + Inf # bq, U + Pol # bq, U + + +cpdef enum Polite_erg: + Inf # bq, U + Pol # bq, U + + +cpdef enum Polite_dat: + Inf # bq, U + Pol # bq, U + + +cpdef enum Prefix: + Yes # U + + +cpdef enum PrepCase: + Npr # cz + Pre # U + + +cpdef enum PunctSide: + Ini # U + Fin # U + +cpdef enum PunctType1: + Peri # U + Qest # U + Excl # U + Quot # U + Brck # U + Comm # U + Colo # U + Semi # U + +cpdef enum PunctType2: + Dash # U + + +cpdef enum Style1: + Arch # cz, fi, U + Rare # cz, fi, U + Poet # cz, U + Norm # cz, U + Coll # cz, U + Vrnc # cz, U + Sing # cz, U + Expr # cz, U + + +cpdef enum Style2: + Derg # cz, U + Vulg # cz, U + + +cpdef enum Typo: + Yes # fi, U + + +cpdef enum Variant: + Short # cz + Bound # cz, sl + + +cpdef enum VerbType: + Aux # U + Cop # U + Mod # U + Light # U + + +cpdef enum FeatureValues: + Animacy_Anim + Animacy_Inam + Aspect_Freq + Aspect_Imp + Aspect_Mod + Aspect_None_ + Aspect_Perf + Case_Abe + Case_Abl + Case_Abs + Case_Acc + Case_Ade + Case_All + Case_Cau + Case_Com + Case_Dat + Case_Del + Case_Dis + Case_Ela + Case_Ess + Case_Gen + Case_Ill + Case_Ine + Case_Ins + Case_Loc + Case_Lat + Case_Nom + Case_Par + Case_Sub + Case_Sup + Case_Tem + Case_Ter + Case_Tra + Case_Voc + Definite_Two + Definite_Def + Definite_Red + Definite_Ind + Degree_Cmp + Degree_Comp + Degree_None + Degree_Pos + Degree_Sup + Degree_Abs + Degree_Com + Degree_Dim # du + Gender_Com + Gender_Fem + Gender_Masc + Gender_Neut + Mood_Cnd + Mood_Imp + Mood_Ind + Mood_N + Mood_Pot + Mood_Sub + Mood_Opt + Negative_Neg + Negative_Pos + Negative_Yes + Number_Com + Number_Dual + Number_None + Number_Plur + Number_Sing + Number_Ptan # bg + Number_Count # bg + NumType_Card + NumType_Dist + NumType_Frac + NumType_Gen + NumType_Mult + NumType_None + NumType_Ord + NumType_Sets + Person_One + Person_Two + Person_Three + Person_None + Poss_Yes + PronType_AdvPart + PronType_Art + PronType_Default + PronType_Dem + PronType_Ind + PronType_Int + PronType_Neg + PronType_Prs + PronType_Rcp + PronType_Rel + PronType_Tot + PronType_Clit + PronType_Exc # es, ca, it, fa + PronType_Clit # it + Reflex_Yes + Tense_Fut + Tense_Imp + Tense_Past + Tense_Pres + VerbForm_Fin + VerbForm_Ger + VerbForm_Inf + VerbForm_None + VerbForm_Part + VerbForm_PartFut + VerbForm_PartPast + VerbForm_PartPres + VerbForm_Sup + VerbForm_Trans + VerbForm_Gdv # la + Voice_Act + Voice_Cau + Voice_Pass + Voice_Mid # gkc + Voice_Int # hb + Abbr_Yes # cz, fi, 
sl, U + AdpType_Prep # cz, U + AdpType_Post # U + AdpType_Voc # cz + AdpType_Comprep # cz + AdpType_Circ # U + AdpType_Voc # U + AdvType_Man + AdvType_Loc + AdvType_Tim + AdvType_Deg + AdvType_Cau + AdvType_Mod + AdvType_Sta + AdvType_Ex + AdvType_Adadj + ConjType_Oper # cz, U + ConjType_Comp # cz, U + Connegative_Yes # fi + # fi + Derivation_Minen + Derivation_Sti + Derivation_Inen + Derivation_Lainen + Derivation_Ja + Derivation_Ton + Derivation_Vs + Derivation_Ttain + Derivation_Ttaa + Echo_Rdp # U + Echo_Ech # U + Foreign_Foreign # cz, fi, U + Foreign_Fscript # cz, fi, U + Foreign_Tscript # cz, U + Foreign_Yes # sl + Gender_dat_Masc # bq, U + Gender_dat_Fem # bq, U + Gender_erg_Masc # bq + Gender_erg_Fem # bq + Gender_psor_Masc # cz, sl, U + Gender_psor_Fem # cz, sl, U + Gender_psor_Neut # sl + Hyph_Yes # cz, U + InfForm_One # fi + InfForm_Two # fi + InfForm_Three # fi + NameType_Geo # U, cz + NameType_Prs # U, cz + NameType_Giv # U, cz + NameType_Sur # U, cz + NameType_Nat # U, cz + NameType_Com # U, cz + NameType_Pro # U, cz + NameType_Oth # U, cz + NounType_Com # U + NounType_Prop # U + NounType_Class # U + Number_abs_Sing # bq, U + Number_abs_Plur # bq, U + Number_dat_Sing # bq, U + Number_dat_Plur # bq, U + Number_erg_Sing # bq, U + Number_erg_Plur # bq, U + Number_psee_Sing # U + Number_psee_Plur # U + Number_psor_Sing # cz, fi, sl, U + Number_psor_Plur # cz, fi, sl, U + NumForm_Digit # cz, sl, U + NumForm_Roman # cz, sl, U + NumForm_Word # cz, sl, U + NumValue_One # cz, U + NumValue_Two # cz, U + NumValue_Three # cz, U + PartForm_Pres # fi + PartForm_Past # fi + PartForm_Agt # fi + PartForm_Neg # fi + PartType_Mod # U + PartType_Emp # U + PartType_Res # U + PartType_Inf # U + PartType_Vbp # U + Person_abs_One # bq, U + Person_abs_Two # bq, U + Person_abs_Three # bq, U + Person_dat_One # bq, U + Person_dat_Two # bq, U + Person_dat_Three # bq, U + Person_erg_One # bq, U + Person_erg_Two # bq, U + Person_erg_Three # bq, U + Person_psor_One # fi, U + Person_psor_Two # fi, U + Person_psor_Three # fi, U + Polite_Inf # bq, U + Polite_Pol # bq, U + Polite_abs_Inf # bq, U + Polite_abs_Pol # bq, U + Polite_erg_Inf # bq, U + Polite_erg_Pol # bq, U + Polite_dat_Inf # bq, U + Polite_dat_Pol # bq, U + Prefix_Yes # U + PrepCase_Npr # cz + PrepCase_Pre # U + PunctSide_Ini # U + PunctSide_Fin # U + PunctType_Peri # U + PunctType_Qest # U + PunctType_Excl # U + PunctType_Quot # U + PunctType_Brck # U + PunctType_Comm # U + PunctType_Colo # U + PunctType_Semi # U + PunctType_Dash # U + Style_Arch # cz, fi, U + Style_Rare # cz, fi, U + Style_Poet # cz, U + Style_Norm # cz, U + Style_Coll # cz, U + Style_Vrnc # cz, U + Style_Sing # cz, U + Style_Expr # cz, U + Style_Derg # cz, U + Style_Vulg # cz, U + Style_Yes # fi, U + StyleVariant_StyleShort # cz + StyleVariant_StyleBound # cz, sl + VerbType_Aux # U + VerbType_Cop # U + VerbType_Mod # U + VerbType_Light # U + + From bbf07ac253e12cdc2ec76dcdde46f5bc6c7dd51b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Aug 2015 01:05:20 +0200 Subject: [PATCH 082/138] * Cut down init_model to work on more languages --- bin/init_model.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index 3307bffa8..9a635f296 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -29,8 +29,6 @@ from shutil import copytree import codecs from collections import defaultdict -from spacy.en import get_lex_props -from spacy.en.lemmatizer import Lemmatizer from 
spacy.vocab import Vocab from spacy.vocab import write_binary_vectors from spacy.strings import hash_string @@ -38,6 +36,11 @@ from preshed.counter import PreshCounter from spacy.parts_of_speech import NOUN, VERB, ADJ +import spacy.en +import spacy.de + + + def setup_tokenizer(lang_data_dir, tok_dir): if not tok_dir.exists(): @@ -139,7 +142,7 @@ def _read_senses(loc): return lexicon -def setup_vocab(src_dir, dst_dir): +def setup_vocab(get_lex_attr, src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() @@ -148,13 +151,13 @@ def setup_vocab(src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) + vocab = Vocab(data_dir=None, get_lex_attr=get_lex_attr) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') if not probs: - oov_prob = 0.0 + oov_prob = -20 else: oov_prob = min(probs.values()) for word in clusters: @@ -163,23 +166,30 @@ def setup_vocab(src_dir, dst_dir): lexicon = [] for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): - entry = get_lex_props(word) - entry['prob'] = float(prob) - cluster = clusters.get(word, '0') + lexeme = vocab[word] + lexeme.prob = prob + lexeme.is_oov = False # Decode as a little-endian string, so that we can do & 15 to get # the first 4 bits. See _parse_features.pyx - entry['cluster'] = int(cluster[::-1], 2) - vocab[word] = entry + if word in clusters: + lexeme.cluster = int(clusters[word][::-1], 2) + else: + lexeme.cluster = 0 vocab.dump(str(dst_dir / 'lexemes.bin')) vocab.strings.dump(str(dst_dir / 'strings.txt')) with (dst_dir / 'oov_prob').open('w') as file_: file_.write('%f' % oov_prob) -def main(lang_data_dir, corpora_dir, model_dir): +def main(lang_id, lang_data_dir, corpora_dir, model_dir): + languages = { + 'en': spacy.en.get_lex_attr, + 'de': spacy.en.get_lex_attr + } + model_dir = Path(model_dir) - lang_data_dir = Path(lang_data_dir) - corpora_dir = Path(corpora_dir) + lang_data_dir = Path(lang_data_dir) / lang_id + corpora_dir = Path(corpora_dir) / lang_id assert corpora_dir.exists() assert lang_data_dir.exists() @@ -188,12 +198,12 @@ def main(lang_data_dir, corpora_dir, model_dir): model_dir.mkdir() setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') - setup_vocab(corpora_dir, model_dir / 'vocab') + setup_vocab(languages[lang_id], corpora_dir, model_dir / 'vocab') if (lang_data_dir / 'gazetteer.json').exists(): copyfile(str(lang_data_dir / 'gazetteer.json'), str(model_dir / 'vocab' / 'gazetteer.json')) - if not (model_dir / 'wordnet').exists(): + if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists(): copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet')) From 5dd76be4460b2d08ab9384c7142452e84f797ee3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Aug 2015 05:25:55 +0200 Subject: [PATCH 083/138] * Split EnPosTagger up into base class and subclass --- setup.py | 2 +- spacy/en/pos.pxd | 27 +------- spacy/en/pos.pyx | 169 +++++++---------------------------------------- spacy/tagger.pxd | 27 ++++++++ spacy/tagger.pyx | 144 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 199 insertions(+), 170 deletions(-) create mode 100644 spacy/tagger.pxd create mode 100644 spacy/tagger.pyx diff --git a/setup.py b/setup.py index 218272504..fe55d0d5a 100644 --- a/setup.py +++ b/setup.py @@ -153,7 
+153,7 @@ def main(modules, is_pypy): MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', - 'spacy.morphology', + 'spacy.morphology', 'spacy.tagger', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy._theano', 'spacy.tokenizer', 'spacy.en.attrs', diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 2fc7b4ac7..213752cf5 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -1,26 +1,5 @@ -from preshed.maps cimport PreshMapArray -from preshed.counter cimport PreshCounter -from cymem.cymem cimport Pool - -from .._ml cimport Model -from ..strings cimport StringStore -from ..structs cimport TokenC, LexemeC, Morphology, PosTag -from ..parts_of_speech cimport univ_pos_t -from .lemmatizer import Lemmatizer +from ..tagger cimport Tagger -cdef class EnPosTagger: - cdef readonly Pool mem - cdef readonly StringStore strings - cdef readonly Model model - cdef public object lemmatizer - cdef PreshMapArray _morph_cache - cdef public dict freqs - - cdef PosTag* tags - cdef readonly object tag_names - cdef readonly object tag_map - cdef readonly int n_tags - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 +cdef class EnPosTagger(Tagger): + pass diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 569b209fc..703d7198c 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -218,155 +218,34 @@ POS_TEMPLATES = ( ) -cdef struct _CachedMorph: - Morphology morph - int lemma - - -def setup_model_dir(tag_names, tag_map, templates, model_dir): - if path.exists(model_dir): - shutil.rmtree(model_dir) - os.mkdir(model_dir) - config = { - 'templates': templates, - 'tag_names': tag_names, - 'tag_map': tag_map - } - with open(path.join(model_dir, 'config.json'), 'w') as file_: - json.dump(config, file_) - - -cdef class EnPosTagger: +cdef class EnPosTagger(Tagger): """A part-of-speech tagger for English""" - def __init__(self, StringStore strings, data_dir): - self.mem = Pool() - model_dir = path.join(data_dir, 'pos') - self.strings = strings - cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) - self.tag_names = sorted(cfg['tag_names']) - assert self.tag_names - self.n_tags = len(self.tag_names) - self.tag_map = cfg['tag_map'] - cdef int n_tags = len(self.tag_names) + 1 - - self.model = Model(n_tags, cfg['templates'], model_dir) - self._morph_cache = PreshMapArray(n_tags) - self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) - for i, tag in enumerate(sorted(self.tag_names)): - pos, props = self.tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - set_morph_from_dict(&self.tags[i].morph, props) - if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): - self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', - 'morphs.json')))) - self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - - def __call__(self, Doc tokens): - """Apply the tagger, setting the POS tags onto the Doc object. - - Args: - tokens (Doc): The tokens to be tagged. 
- """ - if tokens.length == 0: - return 0 - cdef int i + def make_lemmatizer(self, data_dir): + return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) + + cdef int predict(self, int i, const TokenC* tokens) except -1: cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - for i in range(tokens.length): - if tokens.data[i].pos == 0: - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + return arg_max(scores, self.model.n_classes) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def tag_from_strings(self, Doc tokens, object tag_strs): - cdef int i - for i in range(tokens.length): - tokens.data[i].tag = self.strings[tag_strs[i]] - self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], - tokens.data) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def train(self, Doc tokens, object gold_tag_strs): - cdef int i - cdef int loss + cdef int update(self, int i, const TokenC* tokens, int gold) except -1: cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 - for g in gold_tag_strs] - correct = 0 - for i in range(tokens.length): - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - loss = guess != golds[i] if golds[i] != -1 else 0 - self.model.update(context, guess, golds[i], loss) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - correct += loss == 0 - self.freqs[TAG][tokens.data[i].tag] += 1 - return correct - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: - tokens[i].pos = tag.pos - cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) - if cached is NULL: - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) - cached.morph = tag.morph - self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) - tokens[i].lemma = cached.lemma - tokens[i].morph = cached.morph - - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: - if self.lemmatizer is None: - return lex.orth - cdef unicode py_string = self.strings[lex.orth] - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.orth - cdef set lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, pos) - lemma_string = sorted(lemma_strings)[0] - lemma = self.strings[lemma_string] - return lemma - - def load_morph_exceptions(self, dict exc): - cdef unicode pos_str - cdef unicode form_str - cdef unicode lemma_str - cdef dict entries - cdef dict props - cdef int lemma - cdef attr_t orth - cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) - for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.strings[lemma_str] - 
set_morph_from_dict(&cached.morph, props) - self._morph_cache.set(pos, orth, cached) - - -cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1: - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - _fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + guess = arg_max(scores, self.model.n_classes) + loss = guess != gold if gold != -1 else 0 + self.model.update(context, guess, gold, loss) + return guess + cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd new file mode 100644 index 000000000..4aa9acc43 --- /dev/null +++ b/spacy/tagger.pxd @@ -0,0 +1,27 @@ +from preshed.maps cimport PreshMapArray +from preshed.counter cimport PreshCounter +from cymem.cymem cimport Pool + +from ._ml cimport Model +from .strings cimport StringStore +from .structs cimport TokenC, LexemeC, Morphology, PosTag +from .parts_of_speech cimport univ_pos_t + + +cdef class Tagger: + cdef readonly Pool mem + cdef readonly StringStore strings + cdef readonly Model model + cdef public object lemmatizer + cdef PreshMapArray _morph_cache + cdef public dict freqs + + cdef PosTag* tags + cdef readonly object tag_names + cdef readonly object tag_map + cdef readonly int n_tags + + cdef int predict(self, int i, const TokenC* tokens) except -1 + cdef int update(self, int i, const TokenC* tokens, int gold) except -1 + cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 + cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx new file mode 100644 index 000000000..ccb40fd22 --- /dev/null +++ b/spacy/tagger.pyx @@ -0,0 +1,144 @@ +import json +from os import path +from collections import defaultdict + +from thinc.typedefs cimport atom_t, weight_t + +from .typedefs cimport attr_t +from .tokens.doc cimport Doc +from .morphology cimport set_morph_from_dict +from .attrs cimport TAG +from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON +from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE + + +cdef struct _CachedMorph: + Morphology morph + int lemma + + +cdef class Tagger: + """A part-of-speech tagger for English""" + def make_lemmatizer(self): + return None + + def __init__(self, StringStore strings, data_dir): + self.mem = Pool() + model_dir = path.join(data_dir, 'pos') + self.strings = strings + cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) + self.tag_names = sorted(cfg['tag_names']) + assert self.tag_names + self.n_tags = len(self.tag_names) + self.tag_map = cfg['tag_map'] + cdef int n_tags = len(self.tag_names) + 1 + + self.model = Model(n_tags, cfg['templates'], model_dir) + self._morph_cache = PreshMapArray(n_tags) + self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) + for i, tag in enumerate(sorted(self.tag_names)): + pos, props = self.tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + set_morph_from_dict(&self.tags[i].morph, props) + if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): + 
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', + 'morphs.json')))) + self.lemmatizer = self.make_lemmatizer(data_dir) + self.freqs = {TAG: defaultdict(int)} + for tag in self.tag_names: + self.freqs[TAG][self.strings[tag]] = 1 + self.freqs[TAG][0] = 1 + + def __call__(self, Doc tokens): + """Apply the tagger, setting the POS tags onto the Doc object. + + Args: + tokens (Doc): The tokens to be tagged. + """ + if tokens.length == 0: + return 0 + cdef int i + cdef const weight_t* scores + for i in range(tokens.length): + if tokens.data[i].pos == 0: + guess = self.predict(i, tokens.data) + tokens.data[i].tag = self.strings[self.tag_names[guess]] + self.set_morph(i, &self.tags[guess], tokens.data) + + tokens.is_tagged = True + tokens._py_tokens = [None] * tokens.length + + def tag_from_strings(self, Doc tokens, object tag_strs): + cdef int i + for i in range(tokens.length): + tokens.data[i].tag = self.strings[tag_strs[i]] + self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], + tokens.data) + tokens.is_tagged = True + tokens._py_tokens = [None] * tokens.length + + def train(self, Doc tokens, object gold_tag_strs): + cdef int i + cdef int loss + cdef const weight_t* scores + golds = [self.tag_names.index(g) if g is not None else -1 + for g in gold_tag_strs] + correct = 0 + for i in range(tokens.length): + guess = self.update(i, tokens.data, golds[i]) + loss = golds[i] != -1 and guess != golds[i] + tokens.data[i].tag = self.strings[self.tag_names[guess]] + self.set_morph(i, &self.tags[guess], tokens.data) + correct += loss == 0 + self.freqs[TAG][tokens.data[i].tag] += 1 + return correct + + cdef int predict(self, int i, const TokenC* tokens) except -1: + raise NotImplementedError + + cdef int update(self, int i, const TokenC* tokens, int gold) except -1: + raise NotImplementedError + + cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: + tokens[i].pos = tag.pos + cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) + if cached is NULL: + cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) + cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) + cached.morph = tag.morph + self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) + tokens[i].lemma = cached.lemma + tokens[i].morph = cached.morph + + cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: + if self.lemmatizer is None: + return lex.orth + cdef unicode py_string = self.strings[lex.orth] + if pos != NOUN and pos != VERB and pos != ADJ: + return lex.orth + cdef set lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, pos) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings[lemma_string] + return lemma + + def load_morph_exceptions(self, dict exc): + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef attr_t orth + cdef int pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + orth = self.strings[form_str] + cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) + cached.lemma = self.strings[lemma_str] + set_morph_from_dict(&cached.morph, props) + self._morph_cache.set(pos, orth, cached) From f2f699ac186e6fcd69d79e5cdca87d8a489a3614 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:37:17 +0200 Subject: [PATCH 084/138] * Add language base class --- 
spacy/language.py | 195 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 spacy/language.py diff --git a/spacy/language.py b/spacy/language.py new file mode 100644 index 000000000..fca52277b --- /dev/null +++ b/spacy/language.py @@ -0,0 +1,195 @@ +class Language(object): + @staticmethod + def lower(string): + return string.lower() + + @staticmethod + def norm(string): + return string + + @staticmethod + def shape(string): + return orth.word_shape(string) + + @staticmethod + def prefix(string): + return string[0] + + @staticmethod + def suffix(string): + return string[-3:] + + @staticmethod + def prob(string): + return self.oov_prob + + @staticmethod + def cluster(string): + return 0 + + @staticmethod + def is_alpha(string): + return orths.is_alpha(string) + + @staticmethod + def is_lower(string): + return orths.is_lower(string) + + @staticmethod + def is_upper(string): + return orths.is_upper(string) + + @staticmethod + def like_url(string): + return orths.like_url(string) + + @staticmethod + def like_number(string): + return orths.like_number(string) + + @staticmethod + def like_email(string): + return orths.like_email(string) + + def default_lex_attrs(cls, data_dir): + return { + attrs.LOWER: cls.lower, + attrs.NORM: cls.norm, + attrs.SHAPE: cls.shape, + attrs.PREFIX: cls.prefix, + attrs.SUFFIX: cls.suffix, + attrs.CLUSTER: cls.cluster, + attrs.PROB: cls.prob, + + attrs.IS_ALPHA: cls.is_alpha, + attrs.IS_ASCII: cls.is_ascii, + attrs.IS_DIGIT: cls.is_digit, + attrs.IS_LOWER: cls.is_lower, + attrs.IS_UPPER: cls.is_upper, + attrs.LIKE_URL: cls.like_url, + attrs.LIKE_NUM: cls.like_number, + attrs.LIKE_EMAIL: cls.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True + } + + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') + + @classmethod + def default_vocab(cls, get_lex_attr=None, vectors=None, morphology=None, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + if vectors is None: + vectors = cls.default_vectors(data_dir) + if get_lex_attr is None: + get_lex_attr = cls.default_lex_attrs(data_dir) + if morphology is None: + morphology = cls.default_morphology(data_dir) + return vocab = Vocab.from_dir(data_dir, get_lex_attr, vectors, morphology) + + @classmethod + def default_tokenizer(cls, vocab, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + return Tokenizer.from_dir(data_dir, vocab) + + @classmethod + def default_tagger(cls, vocab, data_dir=None): + return Tagger.from_dir(data_dir, vocab) + + @classmethod + def default_parser(cls, vocab, transition_system=None, data_dir=None): + if transition_system is None: + transition_system = ArcEager() + return Parser.from_dir(data_dir, vocab, transition_system) + + @classmethod + def default_entity(cls, vocab, transition_system=None, data_dir=None): + if transition_system is None: + transition_system = BiluoPushDown() + return Parser.from_dir(data_dir, vocab, transition_system) + + @classmethod + def default_matcher(cls, vocab, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + return Matcher(data_dir, vocab) + + @classmethod + def default_serializer(cls, vocab, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + return Packer(data_dir, vocab) + + def __init__(self, vocab=None, tokenizer=None, tagger=None, parser=None, + entity=None, matcher=None, serializer=None): + if data_dir is None: + data_dir = 
self.default_data_dir() + if vocab is None: + vocab = self.default_vocab(data_dir) + if tokenizer is None: + tokenizer = self.default_tokenizer(vocab, data_dir) + if tagger is None: + tagger = self.default_tagger(vocab, data_dir) + if entity is None: + entity = self.default_entity(vocab, data_dir) + if parser is None: + parser = self.default_parser(vocab, data_dir) + if matcher is None: + matcher = self.default_matcher(vocab, data_dir) + if serializer is None: + serializer = self.default_serializer(vocab, data_dir) + self.vocab = vocab + self.tokenizer = tokenizer + self.tagger = tagger + self.parser = parser + self.entity = entity + self.matcher = matcher + self.serializer = serializer + + def __call__(self, text, tag=True, parse=True, entity=True): + """Apply the pipeline to some text. The text can span multiple sentences, + and can contain arbtrary whitespace. Alignment into the original string + is preserved. + + Args: + text (unicode): The text to be processed. + + Returns: + tokens (spacy.tokens.Doc): + + >>> from spacy.en import English + >>> nlp = English() + >>> tokens = nlp('An example sentence. Another example sentence.') + >>> tokens[0].orth_, tokens[0].head.tag_ + ('An', 'NN') + """ + tokens = self.tokenizer(text) + if self.tagger and tag: + self.tagger(tokens) + if self.matcher and entity: + self.matcher(tokens) + if self.parser and parse: + self.parser(tokens) + if self.entity and entity: + self.entity(tokens) + return tokens + + def end_training(self, data_dir=None): + if data_dir is None: + data_dir = self.data_dir + self.parser.model.end_training() + self.entity.model.end_training() + self.tagger.model.end_training() + self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) + + with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: + file_.write( + json.dumps([ + (TAG, list(self.tagger.freqs[TAG].items())), + (DEP, list(self.parser.moves.freqs[DEP].items())), + (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), + (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), + (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) From 8083a07c3e5093349180a93314db11c7e15b108e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:37:30 +0200 Subject: [PATCH 085/138] * Use language base class --- spacy/en/__init__.py | 186 ++----------------------------------------- 1 file changed, 7 insertions(+), 179 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 3d433e497..ca19fb084 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,183 +1,11 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function + from os import path -import re -import struct -import json -from .. import orth -from ..vocab import Vocab -from ..tokenizer import Tokenizer -from ..syntax.arc_eager import ArcEager -from ..syntax.ner import BiluoPushDown -from ..syntax.parser import ParserFactory -from ..serialize.bits import BitArray -from ..matcher import Matcher - -from ..tokens import Doc -from ..multi_words import RegexMerger - -from .pos import EnPosTagger -from .pos import POS_TAGS -from .attrs import get_flags -from . 
import regexes - -from ..util import read_lang_data - -from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB +from ..language import Language -def get_lex_props(string, oov_prob=-30, is_oov=False): - return { - 'flags': get_flags(string, is_oov=is_oov), - 'length': len(string), - 'orth': string, - 'lower': string.lower(), - 'norm': string, - 'shape': orth.word_shape(string), - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': oov_prob, - 'sentiment': 0 - } - -get_lex_attr = {} - -if_model_present = -1 -LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') - - -class English(object): - """The English NLP pipeline. - - Example: - - Load data from default directory: - - >>> nlp = English() - >>> nlp = English(data_dir=u'') - - Load data from specified directory: - - >>> nlp = English(data_dir=u'path/to/data_directory') - - Disable (and avoid loading) parts of the processing pipeline: - - >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False) - - Start with nothing loaded: - - >>> nlp = English(data_dir=None) - """ - ParserTransitionSystem = ArcEager - EntityTransitionSystem = BiluoPushDown - - def __init__(self, - data_dir=LOCAL_DATA_DIR, - Tokenizer=Tokenizer.from_dir, - Tagger=EnPosTagger, - Parser=ParserFactory(ParserTransitionSystem), - Entity=ParserFactory(EntityTransitionSystem), - Matcher=Matcher.from_dir, - Packer=None, - load_vectors=True - ): - self.data_dir = data_dir - - if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): - oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read()) - else: - oov_prob = None - - self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None, - get_lex_props=get_lex_props, load_vectors=load_vectors, - pos_tags=POS_TAGS, - oov_prob=oov_prob) - if Tagger is True: - Tagger = EnPosTagger - if Parser is True: - transition_system = self.ParserTransitionSystem - Parser = lambda s, d: parser.Parser(s, d, transition_system) - if Entity is True: - transition_system = self.EntityTransitionSystem - Entity = lambda s, d: parser.Parser(s, d, transition_system) - - self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer')) - - if Tagger and path.exists(path.join(data_dir, 'pos')): - self.tagger = Tagger(self.vocab.strings, data_dir) - else: - self.tagger = None - if Parser and path.exists(path.join(data_dir, 'deps')): - self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps')) - else: - self.parser = None - if Entity and path.exists(path.join(data_dir, 'ner')): - self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner')) - else: - self.entity = None - if Matcher: - self.matcher = Matcher(self.vocab, data_dir) - else: - self.matcher = None - if Packer: - self.packer = Packer(self.vocab, data_dir) - else: - self.packer = None - self.mwe_merger = RegexMerger([ - ('IN', 'O', regexes.MW_PREPOSITIONS_RE), - ('CD', 'TIME', regexes.TIME_RE), - ('NNP', 'DATE', regexes.DAYS_RE), - ('CD', 'MONEY', regexes.MONEY_RE)]) - - def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): - """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string - is preserved. - - Args: - text (unicode): The text to be processed. - - Returns: - tokens (spacy.tokens.Doc): - - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp('An example sentence. 
Another example sentence.') - >>> tokens[0].orth_, tokens[0].head.tag_ - ('An', 'NN') - """ - tokens = self.tokenizer(text) - if self.tagger and tag: - self.tagger(tokens) - if self.matcher and entity: - self.matcher(tokens) - if self.parser and parse: - self.parser(tokens) - if self.entity and entity: - self.entity(tokens) - if merge_mwes and self.mwe_merger is not None: - self.mwe_merger(tokens) - return tokens - - def end_training(self, data_dir=None): - if data_dir is None: - data_dir = self.data_dir - self.parser.model.end_training() - self.entity.model.end_training() - self.tagger.model.end_training() - self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) - - with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: - file_.write( - json.dumps([ - (TAG, list(self.tagger.freqs[TAG].items())), - (DEP, list(self.parser.moves.freqs[DEP].items())), - (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), - (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), - (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) - - @property - def tags(self): - """Deprecated. List of part-of-speech tag names.""" - return self.tagger.tag_names +class English(Language): + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') From 82217c6ec6de0f1f948deb74b6e98e095d45e0da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:46:19 +0200 Subject: [PATCH 086/138] * Generalize lemmatizer --- spacy/en/lemmatizer.py | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/spacy/en/lemmatizer.py b/spacy/en/lemmatizer.py index 5883e12c8..660a16eb9 100644 --- a/spacy/en/lemmatizer.py +++ b/spacy/en/lemmatizer.py @@ -3,39 +3,6 @@ from os import path import codecs -NOUN_RULES = ( - ('s', ''), - ('ses', 's'), - ('ves', 'f'), - ('xes', 'x'), - ('zes', 'z'), - ('ches', 'ch'), - ('shes', 'sh'), - ('men', 'man'), - ('ies', 'y') -) - - -VERB_RULES = ( - ("s", ""), - ("ies", "y"), - ("es", "e"), - ("es", ""), - ("ed", "e"), - ("ed", ""), - ("ing", "e"), - ("ing", "") -) - - -ADJ_RULES = ( - ("er", ""), - ("est", ""), - ("er", "e"), - ("est", "e") -) - - class Lemmatizer(object): def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id): self.noun_id = noun_id @@ -48,6 +15,8 @@ class Lemmatizer(object): self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) def __call__(self, string, pos): + + return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) if pos == self.noun_id: return self.noun(string) elif pos == self.verb_id: @@ -58,13 +27,13 @@ class Lemmatizer(object): raise Exception("Cannot lemmatize with unknown pos: %s" % pos) def noun(self, string): - return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + return self(string, 'noun') def verb(self, string): - return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + return self(string, 'verb') def adj(self, string): - return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + return self(string, 'adj') def lemmatize(string, index, exceptions, rules): From c5a27d1821d2bccdeec75c8740a442f74d66358d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:47:08 +0200 Subject: [PATCH 087/138] * Move lemmatizer to spacy --- spacy/{en => }/lemmatizer.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/{en => }/lemmatizer.py (100%) diff --git a/spacy/en/lemmatizer.py b/spacy/lemmatizer.py similarity index 100% 
rename from spacy/en/lemmatizer.py rename to spacy/lemmatizer.py From 494da25872d6250068029df9a843baebb85a1a02 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:13:50 +0200 Subject: [PATCH 088/138] * Refactor for more universal spacy --- lang_data/de/infix.txt | 3 + lang_data/de/morphs.json | 0 lang_data/de/prefix.txt | 21 ++++++ lang_data/de/sample.txt | 3 + lang_data/de/specials.json | 149 +++++++++++++++++++++++++++++++++++++ lang_data/de/suffix.txt | 26 +++++++ lang_data/de/tag_map.json | 56 ++++++++++++++ lang_data/en/tag_map.json | 51 +++++++++++++ 8 files changed, 309 insertions(+) create mode 100644 lang_data/de/infix.txt create mode 100644 lang_data/de/morphs.json create mode 100644 lang_data/de/prefix.txt create mode 100644 lang_data/de/sample.txt create mode 100644 lang_data/de/specials.json create mode 100644 lang_data/de/suffix.txt create mode 100644 lang_data/de/tag_map.json create mode 100644 lang_data/en/tag_map.json diff --git a/lang_data/de/infix.txt b/lang_data/de/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/de/infix.txt @@ -0,0 +1,3 @@ +\.\.\. +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-z]) diff --git a/lang_data/de/morphs.json b/lang_data/de/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/de/prefix.txt b/lang_data/de/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/de/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... diff --git a/lang_data/de/sample.txt b/lang_data/de/sample.txt new file mode 100644 index 000000000..12c0bb787 --- /dev/null +++ b/lang_data/de/sample.txt @@ -0,0 +1,3 @@ +Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern. + +Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs. 
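Note on the pattern files added in this patch: infix.txt and prefix.txt above (and suffix.txt further below) hold one tokenizer pattern per line and feed the rule-based tokenizer, which joins them into regular expressions when the language data is loaded. The following is only a rough sketch of how such a file can be consumed; compile_infix is an invented helper for illustration, not spaCy's actual loader.

    import re

    def compile_infix(patterns):
        # one raw regex per line of infix.txt; OR them into a single pattern
        return re.compile('|'.join('(?:%s)' % p for p in patterns))

    # patterns taken verbatim from lang_data/de/infix.txt above
    infix_re = compile_infix([
        r'\.\.\.',
        r'(?<=[a-z])\.(?=[A-Z])',
        r'(?<=[a-zA-Z])-(?=[a-zA-z])',
    ])
    assert infix_re.search(u'Schweizer-Schriftsteller') is not None

The infix and suffix entries are already regular expressions (note the escaping and lookbehinds in suffix.txt), while the prefix entries are literal strings, so a real loader would presumably escape those before joining them.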
diff --git a/lang_data/de/specials.json b/lang_data/de/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/de/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": [{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": 
"<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F": "c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "I.e."}], +"o.ä.": [{"F": "I.E."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/de/suffix.txt b/lang_data/de/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/de/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\. +(?<=[0-9])km diff --git a/lang_data/de/tag_map.json b/lang_data/de/tag_map.json new file mode 100644 index 000000000..ee1bb1b81 --- /dev/null +++ b/lang_data/de/tag_map.json @@ -0,0 +1,56 @@ +{ +"$(": {"pos": "PUNCT", "PunctType": "Brck"}, +"$,": {"pos": "PUNCT", "PunctType": "Comm"}, +"$.": {"pos": "PUNCT", "PunctType": "Peri"}, +"ADJA": {"pos": "ADJ"}, +"ADJD": {"pos": "ADJ", "Variant": "Short"}, +"ADV": {"pos": "ADV"}, +"APPO": {"pos": "ADP", "AdpType": "Post"}, +"APPR": {"pos": "ADP", "AdpType": "Prep"}, +"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, +"APZR": {"pos": "ADP", "AdpType": "Circ"}, +"ART": {"pos": "DET", "PronType": "Art"}, +"CARD": {"pos": "NUM", "NumType": "Card"}, +"FM": {"pos": "X", "Foreign": "Yes"}, +"ITJ": {"pos": "INTJ"}, +"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, +"KON": {"pos": "CONJ"}, +"KOUI": {"pos": "SCONJ"}, +"KOUS": {"pos": "SCONJ"}, +"NE": {"pos": "PROPN"}, +"NN": {"pos": "NOUN"}, +"PAV": {"pos": "ADV", "PronType": "Dem"}, +"PDAT": {"pos": "DET", "PronType": "Dem"}, +"PDS": {"pos": "PRON", "PronType": "Dem"}, +"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, +"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, +"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, +"PPER": {"pos": "PRON", "PronType": "Prs"}, +"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, +"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, +"PRELAT": {"pos": "DET", "PronType": "Rel"}, +"PRELS": {"pos": "PRON", "PronType": "Rel"}, +"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, +"PTKA": {"pos": "PART"}, +"PTKANT": {"pos": "PART", "PartType": "Res"}, +"PTKNEG": {"pos": "PART", "Negative": "Neg"}, +"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, +"PTKZU": {"pos": "PART", "PartType": "Inf"}, +"PWAT": {"pos": "DET", "PronType": "Int"}, +"PWAV": {"pos": "ADV", "PronType": "Int"}, +"PWS": {"pos": "PRON", "PronType": "Int"}, +"TRUNC": {"pos": "X", "Hyph": "Yes"}, +"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, +"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, 
+"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, +"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, +"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, +"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, +"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, +"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, +"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, +"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, +"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, +"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, +"XY": {"pos": "X"} +} diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json new file mode 100644 index 000000000..57d3eedee --- /dev/null +++ b/lang_data/en/tag_map.json @@ -0,0 +1,51 @@ +{ +".": {"pos": "punc", "punctype": "peri"}, +",": {"pos": "punc", "punctype": "comm"}, +"-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"}, +"-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"}, +"``": {"pos": "punc", "punctype": "quot", "puncside": "ini"}, +"\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"}, +":": {"pos": "punc"}, +"$": {"pos": "sym", "other": {"symtype": "currency"}}, +"#": {"pos": "sym", "other": {"symtype": "numbersign"}}, +"AFX": {"pos": "adj", "hyph": "hyph"}, +"CC": {"pos": "conj", "conjtype": "coor"}, +"CD": {"pos": "num", "numtype": "card"}, +"DT": {"pos": "adj", "prontype": "prn"}, +"EX": {"pos": "adv", "advtype": "ex"}, +"FW": {"foreign": "foreign"}, +"HYPH": {"pos": "punc", "punctype": "dash"}, +"IN": {"pos": "adp"}, +"JJ": {"pos": "adj", "degree": "pos"}, +"JJR": {"pos": "adj", "degree": "comp"}, +"JJS": {"pos": "adj", "degree": "sup"}, +"LS": {"pos": "punc", "numtype": "ord"}, +"MD": {"pos": "verb", "verbtype": "mod"}, +"NIL": {}, +"NN": {"pos": "noun", "number": "sing"}, +"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, +"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, +"NNS": {"pos": "noun", "number": "plur"}, +"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"}, +"POS": {"pos": "part", "poss": "poss"}, +"PRP": {"pos": "noun", "prontype": "prs"}, +"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"}, +"RB": {"pos": "adv", "degree": "pos"}, +"RBR": {"pos": "adv", "degree": "comp"}, +"RBS": {"pos": "adv", "degree": "sup"}, +"RP": {"pos": "part"}, +"SYM": {"pos": "sym"}, +"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"}, +"UH": {"pos": "int"}, +"VB": {"pos": "verb", "verbform": "inf"}, +"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"}, +"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"}, +"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"}, +"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"}, +"VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3}, +"WDT": {"pos": "adj", "prontype": "int|rel"}, +"WP": {"pos": "noun", "prontype": "int|rel"}, +"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"}, +"WRB": {"pos": "adv", "prontype": "int|rel"}, +"SP": {"pos": "space"} +} From dc13edd7cb78e751d0954059173c09bc0ebf7394 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:14:05 +0200 Subject: [PATCH 089/138] * Refactor init_model to accomodate other languages --- bin/init_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index 9a635f296..0badf71fc 100644 --- a/bin/init_model.py +++ 
b/bin/init_model.py @@ -151,7 +151,7 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(data_dir=None, get_lex_attr=get_lex_attr) + vocab = Vocab(get_lex_attr=get_lex_attr) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: @@ -183,8 +183,8 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): def main(lang_id, lang_data_dir, corpora_dir, model_dir): languages = { - 'en': spacy.en.get_lex_attr, - 'de': spacy.en.get_lex_attr + 'en': spacy.en.English.default_lex_attrs(), + 'de': spacy.de.Deutsch.default_lex_attrs() } model_dir = Path(model_dir) From c2d8edd0bdae9a6a2a0ac59e8ee37a09524d7674 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:14:19 +0200 Subject: [PATCH 090/138] * Add PROB attribute in attrs.pxd --- spacy/attrs.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index d2ace1cff..c810762ef 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -84,3 +84,4 @@ cpdef enum attr_id_t: ENT_TYPE HEAD SPACY + PROB From c4d87543857c6f40e521c8ea93b0c5ebf920e565 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:15:07 +0200 Subject: [PATCH 091/138] * Specify LOCAL_DATA_DIR global in spacy.en.__init__.py --- spacy/en/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index ca19fb084..f68ff196e 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -4,8 +4,9 @@ from os import path from ..language import Language +LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') class English(Language): @classmethod def default_data_dir(cls): - return path.join(path.dirname(__file__), 'data') + return LOCAL_DATA_DIR From e2ef78b29cee72790d0bf20983d64fd6be32c7da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:15:42 +0200 Subject: [PATCH 092/138] * Gut pos.pyx module, since functionality moved to spacy/tagger.pyx --- spacy/en/pos.pyx | 261 +---------------------------------------------- 1 file changed, 2 insertions(+), 259 deletions(-) diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 703d7198c..8e034eadf 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -1,268 +1,11 @@ from os import path -import json -import os -import shutil -from libc.string cimport memset +from ..parts_of_speech cimport NOUN, VERB, ADJ -from cymem.cymem cimport Address -from thinc.typedefs cimport atom_t, weight_t -from collections import defaultdict - -from ..parts_of_speech cimport univ_pos_t -from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON - -from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE -from ..structs cimport TokenC, Morphology, LexemeC -from ..tokens.doc cimport Doc -from ..morphology cimport set_morph_from_dict -from .._ml cimport arg_max - -from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL -from ..typedefs cimport attr_t - -from .lemmatizer import Lemmatizer - - -cpdef enum en_person_t: - NO_PERSON - FIRST - SECOND - THIRD - NON_THIRD - - -cpdef enum en_number_t: - NO_NUMBER - SINGULAR - PLURAL - MASS - - -cpdef enum en_gender_t: - NO_GENDER - MASCULINE - FEMININE - NEUTER - - -cpdef enum en_case_t: - NO_CASE - NOMINATIVE - GENITIVE - ACCUSATIVE - REFLEXIVE - DEMONYM - - -cpdef enum en_tenspect_t: - NO_TENSE - BASE_VERB - PRESENT - PAST - PASSIVE - ING - MODAL - 
- -cpdef enum misc_t: - NO_MISC - COMPARATIVE - SUPERLATIVE - RELATIVE - NAME - - -cpdef enum: - P2_orth - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_lemma - P2_flags - - P1_orth - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_lemma - P1_flags - - W_orth - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_lemma - W_flags - - N1_orth - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_lemma - N1_flags - - N2_orth - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_lemma - N2_flags - - N_CONTEXT_FIELDS - - -POS_TAGS = { - 'NULL': (NO_TAG, {}), - 'EOL': (EOL, {}), - 'CC': (CONJ, {}), - 'CD': (NUM, {}), - 'DT': (DET, {}), - 'EX': (DET, {}), - 'FW': (X, {}), - 'IN': (ADP, {}), - 'JJ': (ADJ, {}), - 'JJR': (ADJ, {'misc': COMPARATIVE}), - 'JJS': (ADJ, {'misc': SUPERLATIVE}), - 'LS': (X, {}), - 'MD': (VERB, {'tenspect': MODAL}), - 'NN': (NOUN, {}), - 'NNS': (NOUN, {'number': PLURAL}), - 'NNP': (NOUN, {'misc': NAME}), - 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), - 'PDT': (DET, {}), - 'POS': (PRT, {'case': GENITIVE}), - 'PRP': (PRON, {}), - 'PRP$': (PRON, {'case': GENITIVE}), - 'RB': (ADV, {}), - 'RBR': (ADV, {'misc': COMPARATIVE}), - 'RBS': (ADV, {'misc': SUPERLATIVE}), - 'RP': (PRT, {}), - 'SYM': (X, {}), - 'TO': (PRT, {}), - 'UH': (X, {}), - 'VB': (VERB, {}), - 'VBD': (VERB, {'tenspect': PAST}), - 'VBG': (VERB, {'tenspect': ING}), - 'VBN': (VERB, {'tenspect': PASSIVE}), - 'VBP': (VERB, {'tenspect': PRESENT}), - 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), - 'WDT': (DET, {'misc': RELATIVE}), - 'WP': (PRON, {'misc': RELATIVE}), - 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), - 'WRB': (ADV, {'misc': RELATIVE}), - '!': (PUNCT, {}), - '#': (PUNCT, {}), - '$': (PUNCT, {}), - "''": (PUNCT, {}), - "(": (PUNCT, {}), - ")": (PUNCT, {}), - "-LRB-": (PUNCT, {}), - "-RRB-": (PUNCT, {}), - ".": (PUNCT, {}), - ",": (PUNCT, {}), - "``": (PUNCT, {}), - ":": (PUNCT, {}), - "?": (PUNCT, {}), - "ADD": (X, {}), - "NFP": (PUNCT, {}), - "GW": (X, {}), - "AFX": (X, {}), - "HYPH": (PUNCT, {}), - "XX": (X, {}), - "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "SP": (SPACE, {}) -} - - -POS_TEMPLATES = ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), -) +from ..lemmatizer import Lemmatizer cdef class EnPosTagger(Tagger): """A part-of-speech tagger for English""" def make_lemmatizer(self, data_dir): return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) - - cdef int predict(self, int i, const TokenC* tokens) except -1: - cdef atom_t[N_CONTEXT_FIELDS] context - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - _fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) - scores = self.model.score(context) - return arg_max(scores, self.model.n_classes) - - cdef int update(self, int i, const TokenC* tokens, int gold) except -1: - cdef atom_t[N_CONTEXT_FIELDS] context - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - 
_fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - loss = guess != gold if gold != -1 else 0 - self.model.update(context, guess, gold, loss) - return guess - - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.lower - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.tag - context[6] = t.lemma - if t.lex.flags & (1 << IS_ALPHA): - context[7] = 1 - elif t.lex.flags & (1 << IS_PUNCT): - context[7] = 2 - elif t.lex.flags & (1 << LIKE_URL): - context[7] = 3 - elif t.lex.flags & (1 << LIKE_NUM): - context[7] = 4 - else: - context[7] = 0 From 76996f414515765709499c642199892f160d244d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:16:09 +0200 Subject: [PATCH 093/138] * Hack on generic Language class. Still needs work for morphology, defaults, etc --- spacy/language.py | 153 +++++++++++++++++++++++++++++++++------------- 1 file changed, 109 insertions(+), 44 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index fca52277b..706df34a5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,3 +1,19 @@ +from os import path + +from .tokenizer import Tokenizer +from .morphology import Morphology +from .vocab import Vocab +from .syntax.parser import Parser +from .tagger import Tagger +from .matcher import Matcher +from .serialize.packer import Packer +from ._ml import Model +from . import attrs +from . import orth +from .syntax.ner import BiluoPushDown +from .syntax.arc_eager import ArcEager + + class Language(object): @staticmethod def lower(string): @@ -21,7 +37,7 @@ class Language(object): @staticmethod def prob(string): - return self.oov_prob + return -30 @staticmethod def cluster(string): @@ -29,29 +45,50 @@ class Language(object): @staticmethod def is_alpha(string): - return orths.is_alpha(string) + return orth.is_alpha(string) + + @staticmethod + def is_ascii(string): + return orth.is_ascii(string) + + @staticmethod + def is_digit(string): + return string.isdigit() @staticmethod def is_lower(string): - return orths.is_lower(string) + return orth.is_lower(string) + + @staticmethod + def is_punct(string): + return orth.is_punct(string) + + @staticmethod + def is_space(string): + return string.isspace() + + @staticmethod + def is_title(string): + return orth.is_title(string) @staticmethod def is_upper(string): - return orths.is_upper(string) + return orth.is_upper(string) @staticmethod def like_url(string): - return orths.like_url(string) + return orth.like_url(string) @staticmethod def like_number(string): - return orths.like_number(string) + return orth.like_number(string) @staticmethod def like_email(string): - return orths.like_email(string) + return orth.like_email(string) - def default_lex_attrs(cls, data_dir): + @classmethod + def default_lex_attrs(cls, data_dir=None): return { attrs.LOWER: cls.lower, attrs.NORM: cls.norm, @@ -59,12 +96,15 @@ class Language(object): attrs.PREFIX: cls.prefix, attrs.SUFFIX: cls.suffix, attrs.CLUSTER: cls.cluster, - attrs.PROB: cls.prob, + attrs.PROB: lambda string: -10.0, attrs.IS_ALPHA: cls.is_alpha, attrs.IS_ASCII: cls.is_ascii, attrs.IS_DIGIT: cls.is_digit, attrs.IS_LOWER: cls.is_lower, + attrs.IS_PUNCT: cls.is_punct, + attrs.IS_SPACE: cls.is_space, + attrs.IS_TITLE: cls.is_title, attrs.IS_UPPER: 
cls.is_upper, attrs.LIKE_URL: cls.like_url, attrs.LIKE_NUM: cls.like_number, @@ -73,12 +113,36 @@ class Language(object): attrs.IS_OOV: lambda string: True } + @classmethod + def default_dep_templates(cls): + return [] + + @classmethod + def default_ner_templates(cls): + return [] + + @classmethod + def default_dep_labels(cls): + return {0: {'ROOT': True}} + + @classmethod + def default_ner_labels(cls): + return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} + @classmethod def default_data_dir(cls): return path.join(path.dirname(__file__), 'data') @classmethod - def default_vocab(cls, get_lex_attr=None, vectors=None, morphology=None, data_dir=None): + def default_morphology(cls, data_dir): + return Morphology.from_dir(data_dir) + + @classmethod + def default_vectors(cls, data_dir): + return None + + @classmethod + def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None): if data_dir is None: data_dir = cls.default_data_dir() if vectors is None: @@ -86,70 +150,71 @@ class Language(object): if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs(data_dir) if morphology is None: - morphology = cls.default_morphology(data_dir) - return vocab = Vocab.from_dir(data_dir, get_lex_attr, vectors, morphology) + morphology = cls.default_morphology(path.join(data_dir, 'vocab')) + return Vocab.from_dir( + path.join(data_dir, 'vocab'), + get_lex_attr=get_lex_attr, + vectors=vectors, + morphology=morphology) @classmethod - def default_tokenizer(cls, vocab, data_dir=None): - if data_dir is None: - data_dir = cls.default_data_dir() - return Tokenizer.from_dir(data_dir, vocab) + def default_tokenizer(cls, vocab, data_dir): + if path.exists(data_dir): + return Tokenizer.from_dir(vocab, data_dir) + else: + return Tokenizer(vocab, {}, None, None, None) @classmethod - def default_tagger(cls, vocab, data_dir=None): - return Tagger.from_dir(data_dir, vocab) + def default_tagger(cls, vocab, data_dir): + if path.exists(data_dir): + return Tagger.from_dir(data_dir, vocab) + else: + return None @classmethod - def default_parser(cls, vocab, transition_system=None, data_dir=None): - if transition_system is None: - transition_system = ArcEager() - return Parser.from_dir(data_dir, vocab, transition_system) + def default_parser(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, ArcEager) + else: + return None @classmethod - def default_entity(cls, vocab, transition_system=None, data_dir=None): - if transition_system is None: - transition_system = BiluoPushDown() - return Parser.from_dir(data_dir, vocab, transition_system) + def default_entity(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) + else: + return None @classmethod def default_matcher(cls, vocab, data_dir=None): if data_dir is None: data_dir = cls.default_data_dir() - return Matcher(data_dir, vocab) + return Matcher.from_dir(data_dir, vocab) - @classmethod - def default_serializer(cls, vocab, data_dir=None): - if data_dir is None: - data_dir = cls.default_data_dir() - return Packer(data_dir, vocab) - - def __init__(self, vocab=None, tokenizer=None, tagger=None, parser=None, - entity=None, matcher=None, serializer=None): + def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None, + parser=None, entity=None, matcher=None, serializer=None): if data_dir is None: data_dir = self.default_data_dir() if vocab is None: vocab = self.default_vocab(data_dir) if tokenizer is None: - tokenizer = 
self.default_tokenizer(vocab, data_dir) + tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer')) if tagger is None: - tagger = self.default_tagger(vocab, data_dir) + tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos')) if entity is None: - entity = self.default_entity(vocab, data_dir) + entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner')) if parser is None: - parser = self.default_parser(vocab, data_dir) + parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps')) if matcher is None: - matcher = self.default_matcher(vocab, data_dir) - if serializer is None: - serializer = self.default_serializer(vocab, data_dir) + matcher = self.default_matcher(vocab, data_dir=data_dir) self.vocab = vocab self.tokenizer = tokenizer self.tagger = tagger self.parser = parser self.entity = entity self.matcher = matcher - self.serializer = serializer - def __call__(self, text, tag=True, parse=True, entity=True): + def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. From 3acf60df06d2f1bf2afed1049ff87f7402c6b285 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:16:28 +0200 Subject: [PATCH 094/138] * Add missing properties in Lexeme class --- spacy/lexeme.pyx | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 4deec60c1..e99bcfa7c 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -29,6 +29,10 @@ cdef class Lexeme: self.c = vocab.get_by_orth(vocab.mem, orth) assert self.c.orth == orth + property orth_: + def __get__(self): + return self.vocab.strings[self.c.orth] + property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -49,9 +53,13 @@ cdef class Lexeme: def __get__(self): return self.c.suffix def __set__(self, int x): self.c.suffix = x - property orth_: - def __get__(self): - return self.vocab.strings[self.c.orth] + property cluster: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property prob: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x property lower_: def __get__(self): return self.vocab.strings[self.c.lower] @@ -73,6 +81,10 @@ cdef class Lexeme: def __get__(self): return self.c.suffix def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] + property flags: + def __get__(self): return self.c.flags + def __set__(self, flags_t x): self.c.flags = x + property is_oov: def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) From 430affc347423c8312130e5963da93fd471ff3dc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:17:02 +0200 Subject: [PATCH 095/138] * Fix missing n_patterns property in Matcher class. 
Fix from_dir method --- spacy/matcher.pyx | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 9d1220648..2cc91a368 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -99,7 +99,7 @@ def map_attr_name(attr): cdef class Matcher: cdef Pool mem cdef vector[Pattern*] patterns - cdef readonly int n_patterns + cdef readonly Vocab vocab def __init__(self, vocab, patterns): self.mem = Pool() @@ -107,6 +107,19 @@ cdef class Matcher: for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) + @classmethod + def from_dir(cls, data_dir, Vocab vocab): + patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') + if path.exists(patterns_loc): + patterns_data = open(patterns_loc).read() + patterns = json.loads(patterns_data) + return cls(vocab, patterns) + else: + return cls(vocab, {}) + + property n_patterns: + def __get__(self): return self.patterns.size() + def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): entity_key = self.vocab.strings[entity_key] @@ -120,16 +133,6 @@ cdef class Matcher: spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) - @classmethod - def from_dir(cls, vocab, data_dir): - patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') - if path.exists(patterns_loc): - patterns_data = open(patterns_loc).read() - patterns = json.loads(patterns_data) - return cls(vocab, patterns) - else: - return cls(vocab, {}) - def __call__(self, Doc doc): cdef vector[Pattern*] partials cdef int n_partials = 0 From 378729f81af5025f6f45e68a95ca7f5eef24a1a2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:17:21 +0200 Subject: [PATCH 096/138] * Hack Morphology class towards usability --- spacy/morphology.pyx | 136 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 127 insertions(+), 9 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 96a4ba884..f32009351 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,11 +1,129 @@ -# cython: embedsignature=True +from os import path + +try: + import ujson as json +except ImportError: + import json + +from spacy.parts_of_speech import UNIV_POS_NAMES + + +cdef class Morphology: + def __init__(self, tag_map, fused_tokens, lemmatizer): + self.tag_map = tag_map + self.n_tags = len(tag_map) + self.tag_names = tuple(sorted(tag_map.keys())) + self.tag_ids = {} + for i, tag_str in enumerate(self.tag_names): + self.tag_ids[tag_str] = i + + @classmethod + def from_dir(cls, data_dir): + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + return cls(tag_map, {}, None) + + cdef int assign_tag(self, TokenC* token, int tag) except -1: + props = self.tag_map[self.tag_names[tag]] + token.pos = UNIV_POS_NAMES[props['pos'].upper()] + token.tag = tag + #token.inflection = # TODO + + cdef int assign_from_dict(self, TokenC* token, props) except -1: + pass + + def load_morph_exceptions(self, dict exc): + pass + # Map (form, pos) to (lemma, inflection) + #cdef unicode pos_str + #cdef unicode form_str + #cdef unicode lemma_str + #cdef dict entries + #cdef dict props + #cdef int lemma + #cdef attr_t orth + #cdef int pos + #for pos_str, entries in exc.items(): + # pos = self.tag_names.index(pos_str) + # for form_str, props in entries.items(): + # lemma_str = props.get('L', form_str) + # orth = self.strings[form_str] + # cached = self.mem.alloc(1, 
sizeof(InflectedLemma)) + # cached.lemma = self.strings[lemma_str] + # set_morph_from_dict(&cached.morph, props) + # self._morph_cache.set(pos, orth, cached) -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: - morph.number = props.get('number', 0) - morph.tenspect = props.get('tenspect', 0) - morph.mood = props.get('mood', 0) - morph.gender = props.get('gender', 0) - morph.person = props.get('person', 0) - morph.case = props.get('case', 0) - morph.misc = props.get('misc', 0) +#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: +# morph.number = props.get('number', 0) +# morph.tenspect = props.get('tenspect', 0) +# morph.mood = props.get('mood', 0) +# morph.gender = props.get('gender', 0) +# morph.person = props.get('person', 0) +# morph.case = props.get('case', 0) +# morph.misc = props.get('misc', 0) +# +# +#cdef class Morphology: +# cdef Pool mem +# cdef PreshMap table +# +# def __init__(self, tags, exceptions): +# pass +# +# def __getitem__(self, hash_t id_): +# pass +# +# cdef const InflectionC* get(self, hash_t key) except NULL: +# pass +# +# cdef MorphAnalysis analyse(const TokenC* token) except -1: +# cdef struct MorphAnalysis morphology +# tokens[i].pos = tag.pos +# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) +# if cached is NULL: +# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) +# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) +# cached.morph = tag.morph +# self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) +# tokens[i].lemma = cached.lemma +# tokens[i].morph = cached.morph +# +# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: +# if self.lemmatizer is None: +# return lex.orth +# cdef unicode py_string = self.strings[lex.orth] +# if pos != NOUN and pos != VERB and pos != ADJ: +# return lex.orth +# cdef set lemma_strings +# cdef unicode lemma_string +# lemma_strings = self.lemmatizer(py_string, pos) +# lemma_string = sorted(lemma_strings)[0] +# lemma = self.strings[lemma_string] +# return lemma +# +# +#cdef class Inflection: +# cdef InflectionC* c +# +# def __init__(self, container, id_): +# self.c = container[id_] +# self.container = container +# +# for i, feat_id in enumerate(feat_ids): +# feature, value = parse_id(feat_id) +# self.add_value(feature, value, True) +# +# def has(self, Value_t feat_value_id): +# part = feat_value_id % 64 +# bit = feat_value_id / 64 +# if self.value_set[part] & bit: +# return True +# else: +# return False +# +# property pos: def __get__(self): return self.c.pos +# +# property id: def __get__(self): return self.c.id +# +# property features: +# pass From 008b02b03572c1687bc9e9a004adef26920abb5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:17:35 +0200 Subject: [PATCH 097/138] * Comment out enums in Morpohlogy for now --- spacy/morphology.pxd | 982 ++++++++++++++++++++++--------------------- 1 file changed, 493 insertions(+), 489 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 6914eb8d6..7f2ebe34b 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,478 +1,487 @@ -from .structs cimport TokenC, Morphology, PosTag - - -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 - - -cdef enum Feature: - Abbr - AdpType - AdvType - ConjType - Connegative - Derivation - Echo - Foreign - Gender_dat - Gender_erg - Gender_psor - Hyph - InfForm - NameType - NounType - NumberAbs - NumberDat - NumberErg - NumberPsee - NumberPsor - NumForm - NumValue - 
PartForm - PartType - Person_abs - Person_dat - Person_psor - Polite - Polite_abs - Polite_dat - Prefix - PrepCase - PunctSide - PunctType - Style - Typo - Variant - VerbType -cpdef enum Animacy: - Anim - Inam - - -cpdef enum Aspect: - Freq - Imp - Mod - None_ - Perf - - -cpdef enum Case1: - Abe - Abl - Abs - Acc - Ade - All - Cau - Com - -cdef enum Case2: - Dat - Del - Dis - Ela - Ess - Gen - Ill - Ine - -cdef enum Case3: - Ins - Loc - Lat - Nom - Par - Sub - Sup - Tem - Ter - - -cdef enum Case4: - Tra - Voc - - -cpdef enum Definite: - Two - Def - Red - Ind - - -cpdef enum Degree: - Cmp - Comp - None_ - Pos - Sup - Abs - Com - Degree # du - - -cpdef enum Gender: - Com - Fem - Masc - Neut - - -cpdef enum Mood: - Cnd - Imp - Ind - N - Pot - Sub - Opt - - -cpdef enum Negative: - Neg - Pos - Yes - - -cpdef enum Number: - Com - Dual - None_ - Plur - Sing - Ptan # bg - Count # bg - - -cpdef enum NumType: - Card - Dist - Frac - Gen - Mult - None_ - Ord - Sets - - -cpdef enum Person: - One - Two - Three - None_ - - -cpdef enum Poss: - Yes - - -cpdef enum PronType1: - AdvPart - Art - Default - Dem - Ind - Int - Neg - -cpdef enum PronType2: - Prs - Rcp - Rel - Tot - Clit - Exc # es, ca, it, fa - Clit # it - - -cpdef enum Reflex: - Yes - - -cpdef enum Tense: - Fut - Imp - Past - Pres - -cpdef enum VerbForm1: - Fin - Ger - Inf - None_ - Part - PartFut - PartPast - -cpdef enum VerbForm2: - PartPres - Sup - Trans - Gdv # la - - -cpdef enum Voice: - Act - Cau - Pass - Mid # gkc - Int # hb - - -cpdef enum Abbr: - Yes # cz, fi, sl, U - -cpdef enum AdpType: - Prep # cz, U - Post # U - Voc # cz - Comprep # cz - Circ # U - Voc # U - - -cpdef enum AdvType1: - # U - Man - Loc - Tim - Deg - Cau - Mod - Sta - Ex - -cpdef enum AdvType2: - Adadj - -cpdef enum ConjType: - Oper # cz, U - Comp # cz, U - -cpdef enum Connegative: - Yes # fi - - -cpdef enum Derivation1: - Minen # fi - Sti # fi - Inen # fi - Lainen # fi - Ja # fi - Ton # fi - Vs # fi - Ttain # fi - -cpdef enum Derivation2: - Ttaa - - -cpdef enum Echo: - Rdp # U - Ech # U - - -cpdef enum Foreign: - Foreign # cz, fi, U - Fscript # cz, fi, U - Tscript # cz, U - Yes # sl - - -cpdef enum Gender_dat: - Masc # bq, U - Fem # bq, U - - -cpdef enum Gender_erg: - Masc # bq - Fem # bq - - -cpdef enum Gender_psor: - Masc # cz, sl, U - Fem # cz, sl, U - Neut # sl - - -cpdef enum Hyph: - Yes # cz, U - - -cpdef enum InfForm: - One # fi - Two # fi - Three # fi - - -cpdef enum NameType: - Geo # U, cz - Prs # U, cz - Giv # U, cz - Sur # U, cz - Nat # U, cz - Com # U, cz - Pro # U, cz - Oth # U, cz - - -cpdef enum NounType: - Com # U - Prop # U - Class # U - -cpdef enum Number_abs: - Sing # bq, U - Plur # bq, U - -cpdef enum Number_dat: - Sing # bq, U - Plur # bq, U - -cpdef enum Number_erg: - Sing # bq, U - Plur # bq, U - -cpdef enum Number_psee: - Sing # U - Plur # U - - -cpdef enum Number_psor: - Sing # cz, fi, sl, U - Plur # cz, fi, sl, U - - -cpdef enum NumForm: - Digit # cz, sl, U - Roman # cz, sl, U - Word # cz, sl, U - - -cpdef enum NumValue: - One # cz, U - Two # cz, U - Three # cz, U - - -cpdef enum PartForm: - Pres # fi - Past # fi - Agt # fi - Neg # fi - - -cpdef enum PartType: - Mod # U - Emp # U - Res # U - Inf # U - Vbp # U - -cpdef enum Person_abs: - One # bq, U - Two # bq, U - Three # bq, U - - -cpdef enum Person_dat: - One # bq, U - Two # bq, U - Three # bq, U - - -cpdef enum Person_erg: - One # bq, U - Two # bq, U - Three # bq, U - - -cpdef enum Person_psor: - One # fi, U - Two # fi, U - Three # fi, U - - -cpdef enum Polite: - Inf # bq, U - Pol # bq, U - 
- -cpdef enum Polite_abs: - Inf # bq, U - Pol # bq, U - - -cpdef enum Polite_erg: - Inf # bq, U - Pol # bq, U - - -cpdef enum Polite_dat: - Inf # bq, U - Pol # bq, U - - -cpdef enum Prefix: - Yes # U - - -cpdef enum PrepCase: - Npr # cz - Pre # U - - -cpdef enum PunctSide: - Ini # U - Fin # U - -cpdef enum PunctType1: - Peri # U - Qest # U - Excl # U - Quot # U - Brck # U - Comm # U - Colo # U - Semi # U - -cpdef enum PunctType2: - Dash # U - - -cpdef enum Style1: - Arch # cz, fi, U - Rare # cz, fi, U - Poet # cz, U - Norm # cz, U - Coll # cz, U - Vrnc # cz, U - Sing # cz, U - Expr # cz, U - - -cpdef enum Style2: - Derg # cz, U - Vulg # cz, U - - -cpdef enum Typo: - Yes # fi, U - - -cpdef enum Variant: - Short # cz - Bound # cz, sl - - -cpdef enum VerbType: - Aux # U - Cop # U - Mod # U - Light # U - - -cpdef enum FeatureValues: +from .structs cimport TokenC + + +cdef class Morphology: + cdef public object tag_map + cdef public object tag_names + cdef public object tag_ids + cdef public int n_tags + + cdef int assign_tag(self, TokenC* token, int tag) except -1 + + cdef int assign_from_dict(self, TokenC* token, props) except -1 + +# +#cpdef enum Feature_t: +# Abbr +# AdpType +# AdvType +# ConjType +# Connegative +# Derivation +# Echo +# Foreign +# Gender_dat +# Gender_erg +# Gender_psor +# Hyph +# InfForm +# NameType +# NounType +# NumberAbs +# NumberDat +# NumberErg +# NumberPsee +# NumberPsor +# NumForm +# NumValue +# PartForm +# PartType +# Person_abs +# Person_dat +# Person_psor +# Polite +# Polite_abs +# Polite_dat +# Prefix +# PrepCase +# PunctSide +# PunctType +# Style +# Typo +# Variant +# VerbType +# +# +#cpdef enum Animacy: +# Anim +# Inam +# +# +#cpdef enum Aspect: +# Freq +# Imp +# Mod +# None_ +# Perf +# +# +#cpdef enum Case1: +# Nom +# Gen +# Acc +# Dat +# Voc +# Abl +# +#cdef enum Case2: +# Abe +# Abs +# Ade +# All +# Cau +# Com +# Del +# Dis +# +#cdef enum Case3: +# Ela +# Ess +# Ill +# Ine +# Ins +# Loc +# Lat +# Par +# +#cdef enum Case4: +# Sub +# Sup +# Tem +# Ter +# Tra +# +# +#cpdef enum Definite: +# Two +# Def +# Red +# Ind +# +# +#cpdef enum Degree: +# Cmp +# Comp +# None_ +# Pos +# Sup +# Abs +# Com +# Degree # du +# +# +#cpdef enum Gender: +# Com +# Fem +# Masc +# Neut +# +# +#cpdef enum Mood: +# Cnd +# Imp +# Ind +# N +# Pot +# Sub +# Opt +# +# +#cpdef enum Negative: +# Neg +# Pos +# Yes +# +# +#cpdef enum Number: +# Com +# Dual +# None_ +# Plur +# Sing +# Ptan # bg +# Count # bg +# +# +#cpdef enum NumType: +# Card +# Dist +# Frac +# Gen +# Mult +# None_ +# Ord +# Sets +# +# +#cpdef enum Person: +# One +# Two +# Three +# None_ +# +# +#cpdef enum Poss: +# Yes +# +# +#cpdef enum PronType1: +# AdvPart +# Art +# Default +# Dem +# Ind +# Int +# Neg +# +#cpdef enum PronType2: +# Prs +# Rcp +# Rel +# Tot +# Clit +# Exc # es, ca, it, fa +# Clit # it +# +# +#cpdef enum Reflex: +# Yes +# +# +#cpdef enum Tense: +# Fut +# Imp +# Past +# Pres +# +#cpdef enum VerbForm1: +# Fin +# Ger +# Inf +# None_ +# Part +# PartFut +# PartPast +# +#cpdef enum VerbForm2: +# PartPres +# Sup +# Trans +# Gdv # la +# +# +#cpdef enum Voice: +# Act +# Cau +# Pass +# Mid # gkc +# Int # hb +# +# +#cpdef enum Abbr: +# Yes # cz, fi, sl, U +# +#cpdef enum AdpType: +# Prep # cz, U +# Post # U +# Voc # cz +# Comprep # cz +# Circ # U +# Voc # U +# +# +#cpdef enum AdvType1: +# # U +# Man +# Loc +# Tim +# Deg +# Cau +# Mod +# Sta +# Ex +# +#cpdef enum AdvType2: +# Adadj +# +#cpdef enum ConjType: +# Oper # cz, U +# Comp # cz, U +# +#cpdef enum Connegative: +# Yes # fi +# +# +#cpdef enum Derivation1: +# Minen # 
fi +# Sti # fi +# Inen # fi +# Lainen # fi +# Ja # fi +# Ton # fi +# Vs # fi +# Ttain # fi +# +#cpdef enum Derivation2: +# Ttaa +# +# +#cpdef enum Echo: +# Rdp # U +# Ech # U +# +# +#cpdef enum Foreign: +# Foreign # cz, fi, U +# Fscript # cz, fi, U +# Tscript # cz, U +# Yes # sl +# +# +#cpdef enum Gender_dat: +# Masc # bq, U +# Fem # bq, U +# +# +#cpdef enum Gender_erg: +# Masc # bq +# Fem # bq +# +# +#cpdef enum Gender_psor: +# Masc # cz, sl, U +# Fem # cz, sl, U +# Neut # sl +# +# +#cpdef enum Hyph: +# Yes # cz, U +# +# +#cpdef enum InfForm: +# One # fi +# Two # fi +# Three # fi +# +# +#cpdef enum NameType: +# Geo # U, cz +# Prs # U, cz +# Giv # U, cz +# Sur # U, cz +# Nat # U, cz +# Com # U, cz +# Pro # U, cz +# Oth # U, cz +# +# +#cpdef enum NounType: +# Com # U +# Prop # U +# Class # U +# +#cpdef enum Number_abs: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_dat: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_erg: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_psee: +# Sing # U +# Plur # U +# +# +#cpdef enum Number_psor: +# Sing # cz, fi, sl, U +# Plur # cz, fi, sl, U +# +# +#cpdef enum NumForm: +# Digit # cz, sl, U +# Roman # cz, sl, U +# Word # cz, sl, U +# +# +#cpdef enum NumValue: +# One # cz, U +# Two # cz, U +# Three # cz, U +# +# +#cpdef enum PartForm: +# Pres # fi +# Past # fi +# Agt # fi +# Neg # fi +# +# +#cpdef enum PartType: +# Mod # U +# Emp # U +# Res # U +# Inf # U +# Vbp # U +# +#cpdef enum Person_abs: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_dat: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_erg: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_psor: +# One # fi, U +# Two # fi, U +# Three # fi, U +# +# +#cpdef enum Polite: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_abs: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_erg: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_dat: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Prefix: +# Yes # U +# +# +#cpdef enum PrepCase: +# Npr # cz +# Pre # U +# +# +#cpdef enum PunctSide: +# Ini # U +# Fin # U +# +#cpdef enum PunctType1: +# Peri # U +# Qest # U +# Excl # U +# Quot # U +# Brck # U +# Comm # U +# Colo # U +# Semi # U +# +#cpdef enum PunctType2: +# Dash # U +# +# +#cpdef enum Style1: +# Arch # cz, fi, U +# Rare # cz, fi, U +# Poet # cz, U +# Norm # cz, U +# Coll # cz, U +# Vrnc # cz, U +# Sing # cz, U +# Expr # cz, U +# +# +#cpdef enum Style2: +# Derg # cz, U +# Vulg # cz, U +# +# +#cpdef enum Typo: +# Yes # fi, U +# +# +#cpdef enum Variant: +# Short # cz +# Bound # cz, sl +# +# +#cpdef enum VerbType: +# Aux # U +# Cop # U +# Mod # U +# Light # U +# + +cpdef enum Value_t: Animacy_Anim Animacy_Inam Aspect_Freq @@ -566,7 +575,6 @@ cpdef enum FeatureValues: PronType_Tot PronType_Clit PronType_Exc # es, ca, it, fa - PronType_Clit # it Reflex_Yes Tense_Fut Tense_Imp @@ -594,7 +602,6 @@ cpdef enum FeatureValues: AdpType_Voc # cz AdpType_Comprep # cz AdpType_Circ # U - AdpType_Voc # U AdvType_Man AdvType_Loc AdvType_Tim @@ -607,16 +614,15 @@ cpdef enum FeatureValues: ConjType_Oper # cz, U ConjType_Comp # cz, U Connegative_Yes # fi - # fi - Derivation_Minen - Derivation_Sti - Derivation_Inen - Derivation_Lainen - Derivation_Ja - Derivation_Ton - Derivation_Vs - Derivation_Ttain - Derivation_Ttaa + Derivation_Minen # fi + Derivation_Sti # fi + Derivation_Inen # fi + Derivation_Lainen # fi + Derivation_Ja # fi + Derivation_Ton # fi + Derivation_Vs # fi + Derivation_Ttain # fi + Derivation_Ttaa # 
fi Echo_Rdp # U Echo_Ech # U Foreign_Foreign # cz, fi, U @@ -721,5 +727,3 @@ cpdef enum FeatureValues: VerbType_Cop # U VerbType_Mod # U VerbType_Light # U - - From f8f2f4e545f752e25160dae93691828a01d8dce5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:18:19 +0200 Subject: [PATCH 098/138] * Temporarily add PUNC name to parts_of_specch dictionary, until better solution --- spacy/parts_of_speech.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 994a48eba..7081cfab9 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -15,6 +15,7 @@ UNIV_POS_NAMES = { "VERB": VERB, "X": X, "PUNCT": PUNCT, + "PUNC": PUNCT, "SPACE": SPACE, "EOL": EOL } From 1d7f2d3abc91480d53c8886786435e8a08b5def4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:18:36 +0200 Subject: [PATCH 099/138] * Hack on morphology structs --- spacy/structs.pxd | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index f3095df51..f150fa312 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -1,4 +1,4 @@ -from libc.stdint cimport uint8_t, uint32_t, int32_t +from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t @@ -25,21 +25,16 @@ cdef struct LexemeC: float sentiment float l2_norm - -cdef struct Morphology: - uint8_t number - uint8_t tenspect # Tense/aspect/voice - uint8_t mood - uint8_t gender - uint8_t person - uint8_t case - uint8_t misc +cdef struct MorphFeatC: + int name + int value -cdef struct PosTag: - Morphology morph - int id +cdef struct MorphologyC: + uint64_t[4] feature_set + MorphFeatC* features univ_pos_t pos + int n cdef struct Entity: @@ -59,7 +54,7 @@ cdef struct Constituent: cdef struct TokenC: const LexemeC* lex - Morphology morph + const MorphologyC* morph const Constituent* ctnt univ_pos_t pos bint spacy From a3d5e6c0dde9a844823ff1eefc7938d134179003 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:19:01 +0200 Subject: [PATCH 100/138] * Reform constructor and save/load workflow in parser model --- spacy/syntax/parser.pxd | 1 - spacy/syntax/parser.pyx | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 4ee30341a..70a0229c2 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,7 +11,6 @@ from .stateclass cimport StateClass cdef class Parser: - cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 6282339bd..7987547fa 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -67,16 +67,22 @@ def ParserFactory(transition_system): cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, transition_system, model): + self.moves = transition_system + self.model = model + + @classmethod + def from_dir(cls, model_dir, strings, transition_system): if not os.path.exists(model_dir): print >> sys.stderr, "Warning: No model found at", model_dir elif not os.path.isdir(model_dir): print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory" - else: - self.cfg = Config.read(model_dir, 'config') - self.moves = transition_system(strings, self.cfg.labels) - templates = 
get_templates(self.cfg.features) - self.model = Model(self.moves.n_moves, templates, model_dir) + cfg = Config.read(model_dir, 'config') + moves = transition_system(strings, cfg.labels) + templates = get_templates(cfg.features) + model = Model(moves.n_moves, templates, model_dir) + return cls(strings, moves, model) + def __call__(self, Doc tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) From b4faf551f545c7ef47f73d0f9efaad8374fa0f65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:19:21 +0200 Subject: [PATCH 101/138] * Refactor language-independent tagger class --- spacy/tagger.pxd | 11 ++- spacy/tagger.pyx | 223 +++++++++++++++++++++++++++++++---------------- 2 files changed, 151 insertions(+), 83 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 4aa9acc43..213781047 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -4,24 +4,23 @@ from cymem.cymem cimport Pool from ._ml cimport Model from .strings cimport StringStore -from .structs cimport TokenC, LexemeC, Morphology, PosTag +from .structs cimport TokenC, LexemeC from .parts_of_speech cimport univ_pos_t +from .vocab cimport Vocab cdef class Tagger: cdef readonly Pool mem cdef readonly StringStore strings cdef readonly Model model + cdef readonly Vocab vocab cdef public object lemmatizer cdef PreshMapArray _morph_cache cdef public dict freqs - cdef PosTag* tags - cdef readonly object tag_names - cdef readonly object tag_map cdef readonly int n_tags cdef int predict(self, int i, const TokenC* tokens) except -1 cdef int update(self, int i, const TokenC* tokens, int gold) except -1 - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 + #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 + #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index ccb40fd22..5d015b6cc 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -6,50 +6,129 @@ from thinc.typedefs cimport atom_t, weight_t from .typedefs cimport attr_t from .tokens.doc cimport Doc -from .morphology cimport set_morph_from_dict from .attrs cimport TAG from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE +from .attrs cimport * +from ._ml cimport arg_max + -cdef struct _CachedMorph: - Morphology morph - int lemma +cpdef enum: + P2_orth + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_lemma + P2_flags + + P1_orth + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_lemma + P1_flags + + W_orth + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_lemma + W_flags + + N1_orth + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_lemma + N1_flags + + N2_orth + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_lemma + N2_flags + + N_CONTEXT_FIELDS cdef class Tagger: """A part-of-speech tagger for English""" + @classmethod + def read_config(cls, data_dir): + return json.load(open(path.join(data_dir, 'pos', 'config.json'))) + + @classmethod + def default_templates(cls): + return ( + (W_orth,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_orth,), + (N2_orth,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_orth), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + 
(P2_cluster,), + + (W_flags,), + (N1_flags,), + (N2_flags,), + (P1_flags,), + (P2_flags,), + ) + def make_lemmatizer(self): return None - def __init__(self, StringStore strings, data_dir): + def __init__(self, Vocab vocab, templates): self.mem = Pool() - model_dir = path.join(data_dir, 'pos') - self.strings = strings - cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) - self.tag_names = sorted(cfg['tag_names']) - assert self.tag_names - self.n_tags = len(self.tag_names) - self.tag_map = cfg['tag_map'] - cdef int n_tags = len(self.tag_names) + 1 + self.vocab = vocab + + cdef int n_tags = self.vocab.morphology.n_tags + 1 - self.model = Model(n_tags, cfg['templates'], model_dir) - self._morph_cache = PreshMapArray(n_tags) - self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) - for i, tag in enumerate(sorted(self.tag_names)): - pos, props = self.tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - set_morph_from_dict(&self.tags[i].morph, props) - if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): - self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', - 'morphs.json')))) - self.lemmatizer = self.make_lemmatizer(data_dir) + self.model = Model(n_tags, templates) self.freqs = {TAG: defaultdict(int)} for tag in self.tag_names: - self.freqs[TAG][self.strings[tag]] = 1 + self.freqs[TAG][self.vocab.strings[tag]] = 1 self.freqs[TAG][0] = 1 + @property + def tag_names(self): + return tuple(sorted(self.vocab.morphology.tag_map.keys())) + + @classmethod + def from_dir(cls, data_dir, vocab): + if path.exists(path.join(data_dir, 'templates.json')): + templates = json.loads(open(path.join(data_dir, 'templates.json'))) + else: + templates = cls.default_templates() + return cls(vocab, templates) + def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. 
@@ -63,18 +142,14 @@ cdef class Tagger: for i in range(tokens.length): if tokens.data[i].pos == 0: guess = self.predict(i, tokens.data) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - + self.vocab.morphology.assign_tag(&tokens.data[i], guess) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): - tokens.data[i].tag = self.strings[tag_strs[i]] - self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], - tokens.data) + self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length @@ -88,57 +163,51 @@ cdef class Tagger: for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) loss = golds[i] != -1 and guess != golds[i] - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) + + self.vocab.morphology.assign_tag(&tokens.data[i], guess) correct += loss == 0 self.freqs[TAG][tokens.data[i].tag] += 1 return correct cdef int predict(self, int i, const TokenC* tokens) except -1: - raise NotImplementedError + cdef atom_t[N_CONTEXT_FIELDS] context + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + return arg_max(scores, self.model.n_classes) cdef int update(self, int i, const TokenC* tokens, int gold) except -1: - raise NotImplementedError + cdef atom_t[N_CONTEXT_FIELDS] context + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + guess = arg_max(scores, self.model.n_classes) + loss = guess != gold if gold != -1 else 0 + self.model.update(context, guess, gold, loss) + return guess - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: - tokens[i].pos = tag.pos - cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) - if cached is NULL: - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) - cached.morph = tag.morph - self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) - tokens[i].lemma = cached.lemma - tokens[i].morph = cached.morph - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: - if self.lemmatizer is None: - return lex.orth - cdef unicode py_string = self.strings[lex.orth] - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.orth - cdef set lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, pos) - lemma_string = sorted(lemma_strings)[0] - lemma = self.strings[lemma_string] - return lemma - - def load_morph_exceptions(self, dict exc): - cdef unicode pos_str - cdef unicode form_str - cdef unicode lemma_str - cdef dict entries - cdef dict props - cdef int lemma - cdef attr_t orth - cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) - for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] - cached = 
<_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.strings[lemma_str] - set_morph_from_dict(&cached.morph, props) - self._morph_cache.set(pos, orth, cached) +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.lower + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.tag + context[6] = t.lemma + if t.lex.flags & (1 << IS_ALPHA): + context[7] = 1 + elif t.lex.flags & (1 << IS_PUNCT): + context[7] = 2 + elif t.lex.flags & (1 << LIKE_URL): + context[7] = 3 + elif t.lex.flags & (1 << LIKE_NUM): + context[7] = 4 + else: + context[7] = 0 From 119c0f8c3fae12dc33d3e52e282072c54d306738 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:20:11 +0200 Subject: [PATCH 102/138] * Hack out morphology stuff from tokenizer, while morphology being reimplemented. --- spacy/tokenizer.pxd | 2 +- spacy/tokenizer.pyx | 45 +++++++++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index a7f69c5aa..19b8aa026 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport LexemeC, TokenC, Morphology +from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc from .vocab cimport Vocab, _Cached diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1e857aefc..38daf1c5a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -from .morphology cimport set_morph_from_dict from .strings cimport hash_string cimport cython @@ -29,7 +28,7 @@ cdef class Tokenizer: self._suffix_re = suffix_re self._infix_re = infix_re self.vocab = vocab - self._load_special_tokenization(rules, self.vocab.pos_tags) + self._load_special_tokenization(rules) @classmethod def from_dir(cls, Vocab vocab, data_dir): @@ -242,7 +241,7 @@ cdef class Tokenizer: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, object rules, object tag_map): + def _load_special_tokenization(self, special_cases): '''Add a special-case tokenization rule. ''' cdef int i @@ -253,25 +252,15 @@ cdef class Tokenizer: cdef dict props cdef LexemeC** lexemes cdef hash_t hashed - for chunk, substrings in sorted(rules.items()): + for chunk, substrings in sorted(special_cases.items()): tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): form = props['F'] - lemma = props.get("L", None) tokens[i].lex = self.vocab.get(self.vocab.mem, form) - if lemma is not None: - tokens[i].lemma = self.vocab.strings[lemma] - else: - tokens[i].lemma = 0 - if 'pos' in props: - tokens[i].tag = self.vocab.strings[props['pos']] - tokens[i].pos = tag_map[props['pos']][0] - # These are defaults, which can be over-ridden by the - # token-specific props. 
- set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1]) - if tokens[i].lemma == 0: - tokens[i].lemma = tokens[i].lex.orth - set_morph_from_dict(&tokens[i].morph, props) + lemma = props.get('L', form) + tokens[i].lemma = self.vocab.strings[lemma] + #TODO + #self.vocab.morphology.assign_from_dict(&tokens[i], props) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) cached.is_lex = False @@ -279,3 +268,23 @@ cdef class Tokenizer: hashed = hash_string(chunk) self._specials.set(hashed, cached) self._cache.set(hashed, cached) + + +#if lemma is not None: +# tokens[i].lemma = self.vocab.strings[lemma] +#else: +# tokens[i].lemma = 0 +#if 'pos' in props: +# inflection = self.vocab.morphology.get(props['pos']) +# inflection.assign(&tokens[i]) +# # These are defaults, which can be over-ridden by the +# # token-specific props. +# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']] +# #tokens[i].pos = pos +# ## These are defaults, which can be over-ridden by the +# ## token-specific props. +# #set_morph_from_dict(&tokens[i].morph, morph_features) +# #if tokens[i].lemma == 0: +# # tokens[i].lemma = tokens[i].lex.orth +##set_morph_from_dict(&tokens[i].morph, props) + From d30029979ed7d24cb56ed74f2ec3f2b910550173 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:20:46 +0200 Subject: [PATCH 103/138] * Avoid import of morphology in spans --- spacy/tokens/spans.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index f1c19f308..e2aa1a7f9 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -1,7 +1,7 @@ from __future__ import unicode_literals from collections import defaultdict -from ..structs cimport Morphology, TokenC, LexemeC +from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t From 2d521768a30a3d8c64cf30987932d3b448ed08fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:21:03 +0200 Subject: [PATCH 104/138] * Store Morphology class in Vocab --- spacy/vocab.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index cf7a46388..5c88dca68 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64 from .structs cimport LexemeC, TokenC from .typedefs cimport utf8_t, attr_t, hash_t from .strings cimport StringStore +from .morphology cimport Morphology cdef LexemeC EMPTY_LEXEME @@ -27,6 +28,7 @@ cdef class Vocab: cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings + cpdef readonly Morphology morphology cdef readonly int length cdef public object _serializer cdef public object data_dir From 1302d35dff09e64b4863a4b43df8cf37254e5c2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:21:46 +0200 Subject: [PATCH 105/138] * Rework interfaces in vocab --- spacy/vocab.pyx | 53 ++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 4c35ea41c..fa196166e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -21,6 +21,7 @@ from .cfile cimport CFile from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer +from .attrs cimport PROB DEF MAX_VEC_SIZE = 100000 @@ -35,27 +36,37 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. 
''' - def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None): + @classmethod + def default_morphology(cls): + return Morphology({'VBZ': ['VERB', {}]}, [], None) + + def __init__(self, get_lex_attr=None, morphology=None, vectors=None): + self.get_lex_attr = get_lex_attr + if morphology is None: + morphology = self.default_morphology() + self.morphology = morphology + self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.get_lex_attr = get_lex_attr - self.repvec_length = 0 + self.length = 1 - self.pos_tags = pos_tags - if data_dir is not None: - if not path.exists(data_dir): - raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if not path.isdir(data_dir): - raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) - self.load_lexemes(path.join(data_dir, 'strings.txt'), - path.join(data_dir, 'lexemes.bin')) - if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): - self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) - self._serializer = None - self.data_dir = data_dir + + @classmethod + def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None): + if not path.exists(data_dir): + raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) + if not path.isdir(data_dir): + raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) + cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, + morphology=morphology) + self.load_lexemes(path.join(data_dir, 'strings.txt'), + path.join(data_dir, 'lexemes.bin')) + if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): + self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) + return self property serializer: def __get__(self): @@ -83,7 +94,6 @@ cdef class Vocab: lex = self._by_hash.get(key) cdef size_t addr if lex != NULL: - print string, lex.orth, self.strings[string] assert lex.orth == self.strings[string] return lex else: @@ -106,17 +116,21 @@ cdef class Vocab: cdef hash_t key cdef bint is_oov = mem is not self.mem mem = self.mem - #if len(string) < 3: - # mem = self.mem + if len(string) < 3: + mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) lex.orth = self.strings[string] + lex.length = len(string) lex.id = self.length if self.get_lex_attr is not None: for attr, func in self.get_lex_attr.items(): value = func(string) if isinstance(value, unicode): value = self.strings[value] - Lexeme.set_struct_attr(lex, attr, value) + if attr == PROB: + lex.prob = value + else: + Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 else: @@ -128,7 +142,6 @@ cdef class Vocab: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: self._by_hash.set(key, lex) self._by_orth.set(lex.orth, lex) - print "Add lex", key, lex.orth, self.strings[lex.orth] self.length += 1 def __iter__(self): From 658c4a39305edf7ebdb6da0c090ee09343d26644 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:22:06 +0200 Subject: [PATCH 106/138] * Mark test_inital as requiring models --- tests/parser/test_initial_actions_parse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index c1603cd93..cdaf25f91 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -1,6 +1,7 @@ import pytest +@pytest.mark.models def test_initial(EN): doc = EN.tokenizer(u'I 
ate the pizza with anchovies.') EN.tagger(doc) From ff9db9f3ae6655eb4e6c6b7ebd739228b09c3ca1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:22:26 +0200 Subject: [PATCH 107/138] * Fix serializer tests for new attr scheme --- tests/serialize/test_codecs.py | 21 +++------------------ tests/serialize/test_packer.py | 27 ++++++--------------------- 2 files changed, 9 insertions(+), 39 deletions(-) diff --git a/tests/serialize/test_codecs.py b/tests/serialize/test_codecs.py index ad9012068..00177f21a 100644 --- a/tests/serialize/test_codecs.py +++ b/tests/serialize/test_codecs.py @@ -41,25 +41,10 @@ def test_attribute(): def test_vocab_codec(): - def get_lex_props(string, prob): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - vocab = Vocab() - vocab['dog'] = get_lex_props('dog', 0.001) - vocab['the'] = get_lex_props('the', 0.05) - vocab['jumped'] = get_lex_props('jumped', 0.005) + lex = vocab['dog'] + lex = vocab['the'] + lex = vocab['jumped'] codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab]) diff --git a/tests/serialize/test_packer.py b/tests/serialize/test_packer.py index 5770a8938..6ec583d08 100644 --- a/tests/serialize/test_packer.py +++ b/tests/serialize/test_packer.py @@ -5,6 +5,7 @@ import re import pytest import numpy +from spacy.language import Language from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.tokenizer import Tokenizer @@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer from spacy.serialize.bits import BitArray -def get_lex_props(string, prob=-22, is_oov=False): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - - @pytest.fixture def vocab(): - vocab = Vocab(get_lex_props=get_lex_props) - vocab['dog'] = get_lex_props('dog', 0.001) + vocab = Vocab(Language.default_lex_attrs()) + lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' - vocab['the'] = get_lex_props('the', 0.01) - vocab['quick'] = get_lex_props('quick', 0.005) - vocab['jumped'] = get_lex_props('jumped', 0.007) + lex = vocab['the'] + lex = vocab['quick'] + lex = vocab['jumped'] return vocab From 884251801ea4e3c26e4f9f606f6ee2c091fd488c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:22:50 +0200 Subject: [PATCH 108/138] * Mark space tests as requiring model --- tests/tagger/test_spaces.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tagger/test_spaces.py b/tests/tagger/test_spaces.py index c3052160e..0ef05637b 100644 --- a/tests/tagger/test_spaces.py +++ b/tests/tagger/test_spaces.py @@ -14,6 +14,7 @@ def tagged(EN): tokens = EN(string, tag=True, parse=False) return tokens +@pytest.mark.models def test_spaces(tagged): assert tagged[0].pos != SPACE assert tagged[0].pos_ != 'SPACE' From c07eea8563c0c361842caee304ec0007d40629e6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:23:04 +0200 Subject: [PATCH 109/138] * Comment out old doc tests for now --- tests/test_docs.py | 155 +++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 77 deletions(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index 70c8b8c63..4b0831dfd 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,80 +1,81 @@ # 
-*- coding: utf-8 -*- """Sphinx doctest is just too hard. Manually paste doctest examples here""" +import pytest -@pytest.mark.models -def test_1(): - import spacy.en - from spacy.parts_of_speech import ADV - # Load the pipeline, and call it with some text. - nlp = spacy.en.English() - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", - tag=True, parse=False) - o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) - assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" - - o = nlp.vocab[u'back'].prob - assert o == -7.033305644989014 - o = nlp.vocab[u'not'].prob - assert o == -5.332601070404053 - o = nlp.vocab[u'quietly'].prob - assert o == -11.994928359985352 - - -@pytest.mark.models -def test2(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - -@pytest.mark.models -def test3(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - - pleaded = tokens[7] - assert pleaded.repvec.shape == (300,) - o = pleaded.repvec[:5] - assert sum(o) != 0 - from numpy import dot - from numpy.linalg import norm - - cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) - words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] - words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) - words.reverse() - o = [w.orth_ for w in words[0:20]] - assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded', - u'pleads', u'testified', u'conspired', u'motioned', u'demurred', - u'countersued', u'remonstrated', u'begged', u'apologised', - u'consented', u'acquiesced', u'petitioned', u'quarreled', - u'appealed', u'pleading'] - o = [w.orth_ for w in words[50:60]] - assert o == [u'martialed', u'counselled', u'bragged', - u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', - u'dissented', u'yearned'] - o = [w.orth_ for w in words[100:110]] - assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', - u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', - u'clerked'] - - #o = [w.orth_ for w in words[1000:1010]] - #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', - # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] - #o = [w.orth_ for w in words[50000:50010]] - #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', - # u'dirty', u'rims', u'artists'] +#@pytest.mark.models +#def test_1(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# # Load the pipeline, and call it with some text. 
+# nlp = spacy.en.English() +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", +# tag=True, parse=False) +# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) +# assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" +# +# o = nlp.vocab[u'back'].prob +# assert o == -7.033305644989014 +# o = nlp.vocab[u'not'].prob +# assert o == -5.332601070404053 +# o = nlp.vocab[u'quietly'].prob +# assert o == -11.994928359985352 +# +# +#@pytest.mark.m +#def test2(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +#@pytest.mark.models +#def test3(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +# pleaded = tokens[7] +# assert pleaded.repvec.shape == (300,) +# o = pleaded.repvec[:5] +# assert sum(o) != 0 +# from numpy import dot +# from numpy.linalg import norm +# +# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) +# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] +# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) +# words.reverse() +# o = [w.orth_ for w in words[0:20]] +# assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded', +# u'pleads', u'testified', u'conspired', u'motioned', u'demurred', +# u'countersued', u'remonstrated', u'begged', u'apologised', +# u'consented', u'acquiesced', u'petitioned', u'quarreled', +# u'appealed', u'pleading'] +# o = [w.orth_ for w in words[50:60]] +# assert o == [u'martialed', u'counselled', u'bragged', +# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', +# u'dissented', u'yearned'] +# o = [w.orth_ for w in words[100:110]] +# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', +# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', +# u'clerked'] +# +# #o = [w.orth_ for w in words[1000:1010]] +# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', +# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] +# #o = [w.orth_ for w in words[50000:50010]] +# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', +# # u'dirty', u'rims', u'artists'] From 56c4e07a59ae7cd35b00a9de0ee0666938396104 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 08:53:48 +1000 Subject: [PATCH 110/138] Update gazetteer.json --- lang_data/en/gazetteer.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lang_data/en/gazetteer.json b/lang_data/en/gazetteer.json index 1aa6b9514..dce2e1f2a 100644 --- a/lang_data/en/gazetteer.json +++ b/lang_data/en/gazetteer.json @@ -14,8 +14,8 @@ {"orth": "9/11"} ], [ - {"lower": "Septmber"}, - {"lower": "Eleven"} + {"lower": "septmber"}, + 
{"lower": "eleven"} ], [ {"lower": "september"}, From 320ced276a4da0f2db54594c9fb4f7e59084c86e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 09:15:41 +0200 Subject: [PATCH 111/138] * Add tagger training script --- bin/tagger/train.py | 175 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100755 bin/tagger/train.py diff --git a/bin/tagger/train.py b/bin/tagger/train.py new file mode 100755 index 000000000..9cd8cc011 --- /dev/null +++ b/bin/tagger/train.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +from __future__ import division +from __future__ import unicode_literals +from __future__ import print_function + +import os +from os import path +import shutil +import codecs +import random + +import plac +import re + +import spacy.util +from spacy.en import English + +from spacy.tagger import Tagger + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + + +def score_model(scorer, nlp, raw_text, annot_tuples): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + beam_width=1, verbose=False, + use_orig_arc_eager=False): + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + templates = Tagger.default_templates() + nlp = Language(data_dir=model_dir, tagger=False) + nlp.tagger = Tagger.blank(nlp.vocab, templates) + + print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + words = annot_tuples[1] + gold_tags = annot_tuples[2] + score_model(scorer, nlp, raw_text, annot_tuples) + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(words) + else: + tokens = nlp.tokenizer(raw_text) + loss += nlp.tagger.train(tokens, gold_tags) + random.shuffle(gold_tuples) + print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc)) + nlp.end_training(model_dir) + +def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, + beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) 
+ return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): + if not eval_only: + gold_train = list(read_json_file(train_loc)) + train(English, gold_train, model_dir, + feat_set='basic' if not debug else 'debug', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(English, list(read_json_file(dev_loc)), + model_dir, gold_preproc=gold_preproc, verbose=verbose) + print('TOK', scorer.token_acc) + print('POS', scorer.tags_acc) + print('UAS', scorer.uas) + print('LAS', scorer.las) + + print('NER P', scorer.ents_p) + print('NER R', scorer.ents_r) + print('NER F', scorer.ents_f) + + +if __name__ == '__main__': + plac.call(main) From 0af139e18376cb2286c8a53a0233fea79130c738 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 09:16:11 +0200 Subject: [PATCH 112/138] * Tagger training now working. Still need to test load/save of model. Morphology still broken. 
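For reference, a minimal driver for the training script added in PATCH 111, mirroring what
main() does when invoked through plac. This is only a sketch: the corpus and model paths are
placeholders, and train() and evaluate() are the functions defined in bin/tagger/train.py above,
so the snippet assumes it runs in that script's namespace.

    from spacy.en import English
    from spacy.gold import read_json_file

    gold_train = list(read_json_file('corpora/en/train.json'))     # placeholder path
    train(English, gold_train, 'models/en_tagger', n_iter=15)      # prints per-iteration loss and accuracy
    scorer = evaluate(English, list(read_json_file('corpora/en/dev.json')),
                      'models/en_tagger', gold_preproc=False)
    print('POS', scorer.tags_acc)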
--- lang_data/en/tag_map.json | 35 ++++++++++++++++---------- spacy/_ml.pyx | 6 +++-- spacy/language.py | 30 +++++++++++----------- spacy/lemmatizer.py | 46 +++++++++++++++++++++------------- spacy/morphology.pxd | 5 +++- spacy/morphology.pyx | 21 ++++++++++------ spacy/parts_of_speech.pxd | 11 ++++++--- spacy/parts_of_speech.pyx | 16 +++++++----- spacy/tagger.pxd | 18 ++------------ spacy/tagger.pyx | 52 +++++++++++++++++++-------------------- 10 files changed, 134 insertions(+), 106 deletions(-) diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json index 57d3eedee..b9f8269f7 100644 --- a/lang_data/en/tag_map.json +++ b/lang_data/en/tag_map.json @@ -1,11 +1,12 @@ { -".": {"pos": "punc", "punctype": "peri"}, -",": {"pos": "punc", "punctype": "comm"}, -"-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"}, -"-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"}, -"``": {"pos": "punc", "punctype": "quot", "puncside": "ini"}, -"\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"}, -":": {"pos": "punc"}, +".": {"pos": "punct", "puncttype": "peri"}, +",": {"pos": "punct", "puncttype": "comm"}, +"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"}, +"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"}, +"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"}, +"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +":": {"pos": "punct"}, "$": {"pos": "sym", "other": {"symtype": "currency"}}, "#": {"pos": "sym", "other": {"symtype": "numbersign"}}, "AFX": {"pos": "adj", "hyph": "hyph"}, @@ -13,15 +14,15 @@ "CD": {"pos": "num", "numtype": "card"}, "DT": {"pos": "adj", "prontype": "prn"}, "EX": {"pos": "adv", "advtype": "ex"}, -"FW": {"foreign": "foreign"}, -"HYPH": {"pos": "punc", "punctype": "dash"}, +"FW": {"pos": "x", "foreign": "foreign"}, +"HYPH": {"pos": "punct", "puncttype": "dash"}, "IN": {"pos": "adp"}, "JJ": {"pos": "adj", "degree": "pos"}, "JJR": {"pos": "adj", "degree": "comp"}, "JJS": {"pos": "adj", "degree": "sup"}, -"LS": {"pos": "punc", "numtype": "ord"}, +"LS": {"pos": "punct", "numtype": "ord"}, "MD": {"pos": "verb", "verbtype": "mod"}, -"NIL": {}, +"NIL": {"pos": "no_tag"}, "NN": {"pos": "noun", "number": "sing"}, "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, @@ -36,7 +37,7 @@ "RP": {"pos": "part"}, "SYM": {"pos": "sym"}, "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"}, -"UH": {"pos": "int"}, +"UH": {"pos": "intJ"}, "VB": {"pos": "verb", "verbform": "inf"}, "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"}, "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"}, @@ -47,5 +48,13 @@ "WP": {"pos": "noun", "prontype": "int|rel"}, "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"}, "WRB": {"pos": "adv", "prontype": "int|rel"}, -"SP": {"pos": "space"} +"SP": {"pos": "space"}, +"ADD": {"pos": "x"}, +"NFP": {"pos": "punct"}, +"GW": {"pos": "x"}, +"AFX": {"pos": "x"}, +"HYPH": {"pos": "punct"}, +"XX": {"pos": "x"}, +"BES": {"pos": "verb"}, +"HVS": {"pos": "verb"}, } diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index 18908e89e..56c080fa6 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -91,6 +91,8 @@ cdef class Model: count_feats(counts[guess], feats, n_feats, -cost) self._model.update(counts) - def end_training(self): + def end_training(self, model_loc=None): + if model_loc is None: + model_loc = 
self.model_loc self._model.end_training() - self._model.dump(self.model_loc, freq_thresh=0) + self._model.dump(model_loc, freq_thresh=0) diff --git a/spacy/language.py b/spacy/language.py index 706df34a5..2a07d1f5f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,5 +1,10 @@ from os import path +try: + import ujson as json +except ImportError: + import json + from .tokenizer import Tokenizer from .morphology import Morphology from .vocab import Vocab @@ -13,6 +18,8 @@ from . import orth from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager +from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD + class Language(object): @staticmethod @@ -113,14 +120,6 @@ class Language(object): attrs.IS_OOV: lambda string: True } - @classmethod - def default_dep_templates(cls): - return [] - - @classmethod - def default_ner_templates(cls): - return [] - @classmethod def default_dep_labels(cls): return {0: {'ROOT': True}} @@ -186,10 +185,11 @@ class Language(object): return None @classmethod - def default_matcher(cls, vocab, data_dir=None): - if data_dir is None: - data_dir = cls.default_data_dir() - return Matcher.from_dir(data_dir, vocab) + def default_matcher(cls, vocab, data_dir): + if path.exists(data_dir): + return Matcher.from_dir(data_dir, vocab) + else: + return None def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None, serializer=None): @@ -245,9 +245,9 @@ class Language(object): def end_training(self, data_dir=None): if data_dir is None: data_dir = self.data_dir - self.parser.model.end_training() - self.entity.model.end_training() - self.tagger.model.end_training() + self.parser.model.end_training(path.join(data_dir, 'deps', 'model')) + self.entity.model.end_training(path.join(data_dir, 'ner', 'model')) + self.tagger.model.end_training(path.join(data_dir, 'pos', 'model')) self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 660a16eb9..05029391b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -2,29 +2,41 @@ from __future__ import unicode_literals from os import path import codecs +try: + import ujson as json +except ImportError: + import json + +from .parts_of_speech import NOUN, VERB, ADJ + class Lemmatizer(object): - def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id): - self.noun_id = noun_id - self.verb_id = verb_id - self.adj_id = adj_id - self.index = {} - self.exc = {} + @classmethod + def from_dir(cls, data_dir): + index = {} + exc = {} for pos in ['adj', 'adv', 'noun', 'verb']: - self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) - self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + index[pos] = read_index(path.join(data_dir, 'index.%s' % pos)) + exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos)) + rules = json.load(open(path.join(data_dir, 'lemma_rules.json'))) + return cls(index, exc, rules) + + def __init__(self, index, exceptions, rules): + self.index = index + self.exc = exceptions + self.rules = rules def __call__(self, string, pos): - - return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) - if pos == self.noun_id: - return self.noun(string) - elif pos == self.verb_id: - return self.verb(string) - elif pos == self.adj_id: - return self.adj(string) + if pos == NOUN: + pos = 'noun' + elif pos == VERB: + pos = 'verb' + elif pos == ADJ: + pos = 'adj' else: - raise 
Exception("Cannot lemmatize with unknown pos: %s" % pos) + return string + lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) + return min(lemmas) def noun(self, string): return self(string, 'noun') diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 7f2ebe34b..e0f85f96f 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,13 +1,16 @@ from .structs cimport TokenC +from .strings cimport StringStore cdef class Morphology: + cdef readonly object strings + cdef public object lemmatizer cdef public object tag_map cdef public object tag_names cdef public object tag_ids cdef public int n_tags - cdef int assign_tag(self, TokenC* token, int tag) except -1 + cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1 cdef int assign_from_dict(self, TokenC* token, props) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index f32009351..2b8fa3960 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,4 +1,5 @@ from os import path +from .lemmatizer import Lemmatizer try: import ujson as json @@ -9,7 +10,15 @@ from spacy.parts_of_speech import UNIV_POS_NAMES cdef class Morphology: + @classmethod + def from_dir(cls, data_dir, lemmatizer=None): + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + if lemmatizer is None: + lemmatizer = Lemmatizer.from_dir(data_dir) + return cls(tag_map, {}, lemmatizer) + def __init__(self, tag_map, fused_tokens, lemmatizer): + self.lemmatizer = lemmatizer self.tag_map = tag_map self.n_tags = len(tag_map) self.tag_names = tuple(sorted(tag_map.keys())) @@ -17,15 +26,13 @@ cdef class Morphology: for i, tag_str in enumerate(self.tag_names): self.tag_ids[tag_str] = i - @classmethod - def from_dir(cls, data_dir): - tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) - return cls(tag_map, {}, None) - - cdef int assign_tag(self, TokenC* token, int tag) except -1: + cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1: + # TODO Caching props = self.tag_map[self.tag_names[tag]] token.pos = UNIV_POS_NAMES[props['pos'].upper()] - token.tag = tag + token.tag = strings[self.tag_names[tag]] + lemma = self.lemmatizer(strings[token.lex.orth], token.pos) + token.lemma = strings[lemma] #token.inflection = # TODO cdef int assign_from_dict(self, TokenC* token, props) except -1: diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index b915b9dde..e410c6971 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -2,17 +2,22 @@ cpdef enum univ_pos_t: NO_TAG ADJ - ADV ADP + ADV + AUX CONJ DET + INTJ NOUN NUM + PART PRON - PRT + PROPN + PUNCT + SCONJ + SYM VERB X - PUNCT EOL SPACE N_UNIV_TAGS diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 7081cfab9..8c2348a47 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -4,18 +4,22 @@ from __future__ import unicode_literals UNIV_POS_NAMES = { "NO_TAG": NO_TAG, "ADJ": ADJ, - "ADV": ADV, "ADP": ADP, + "ADV": ADV, + "AUX": AUX, "CONJ": CONJ, "DET": DET, + "INTJ": INTJ, "NOUN": NOUN, "NUM": NUM, + "PART": PART, "PRON": PRON, - "PRT": PRT, + "PROPN": PROPN, + "PUNCT": PUNCT, + "SCONJ": SCONJ, + "SYM": SYM, "VERB": VERB, "X": X, - "PUNCT": PUNCT, - "PUNC": PUNCT, - "SPACE": SPACE, - "EOL": EOL + "EOL": EOL, + "SPACE": SPACE } diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 213781047..28d7fc711 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -1,26 +1,12 @@ -from preshed.maps cimport PreshMapArray -from preshed.counter 
cimport PreshCounter -from cymem.cymem cimport Pool - from ._ml cimport Model -from .strings cimport StringStore -from .structs cimport TokenC, LexemeC -from .parts_of_speech cimport univ_pos_t +from .structs cimport TokenC from .vocab cimport Vocab cdef class Tagger: - cdef readonly Pool mem - cdef readonly StringStore strings - cdef readonly Model model cdef readonly Vocab vocab - cdef public object lemmatizer - cdef PreshMapArray _morph_cache + cdef readonly Model model cdef public dict freqs - cdef readonly int n_tags - cdef int predict(self, int i, const TokenC* tokens) except -1 cdef int update(self, int i, const TokenC* tokens, int gold) except -1 - #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 5d015b6cc..7b638c724 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -8,7 +8,7 @@ from .typedefs cimport attr_t from .tokens.doc cimport Doc from .attrs cimport TAG from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON -from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE +from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .attrs cimport * from ._ml cimport arg_max @@ -102,24 +102,10 @@ cdef class Tagger: (P2_flags,), ) - def make_lemmatizer(self): - return None - - def __init__(self, Vocab vocab, templates): - self.mem = Pool() - self.vocab = vocab - - cdef int n_tags = self.vocab.morphology.n_tags + 1 - - self.model = Model(n_tags, templates) - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.vocab.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - - @property - def tag_names(self): - return tuple(sorted(self.vocab.morphology.tag_map.keys())) + @classmethod + def blank(cls, vocab, templates): + model = Model(vocab.morphology.n_tags, templates, model_loc=None) + return cls(vocab, model) @classmethod def from_dir(cls, data_dir, vocab): @@ -127,7 +113,22 @@ cdef class Tagger: templates = json.loads(open(path.join(data_dir, 'templates.json'))) else: templates = cls.default_templates() - return cls(vocab, templates) + model = Model(vocab.morphology.n_tags, templates, data_dir) + return cls(vocab, model) + + def __init__(self, Vocab vocab, model): + self.vocab = vocab + self.model = model + + # TODO: Move this to tag map + self.freqs = {TAG: defaultdict(int)} + for tag in self.tag_names: + self.freqs[TAG][self.vocab.strings[tag]] = 1 + self.freqs[TAG][0] = 1 + + @property + def tag_names(self): + return self.vocab.morphology.tag_names def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. 
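# A rough sketch of how the reworked Tagger API above is meant to be driven, mirroring
# bin/tagger/train.py from PATCH 111. The nlp object, model_dir, training doc and gold
# tag strings are assumed to exist already; they are not part of this patch.
from os import path
from spacy.tagger import Tagger

templates = Tagger.default_templates()
tagger = Tagger.blank(nlp.vocab, templates)          # untrained model (model_loc=None)
n_correct = tagger.train(train_doc, gold_tags)       # one gold tag string (or None) per token
tagger.model.end_training(path.join(model_dir, 'pos', 'model'))
tagger(nlp.tokenizer(u'This is a test.'))            # __call__ assigns predicted tags in place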
@@ -142,29 +143,28 @@ cdef class Tagger: for i in range(tokens.length): if tokens.data[i].pos == 0: guess = self.predict(i, tokens.data) - self.vocab.morphology.assign_tag(&tokens.data[i], guess) + self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): - self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) + self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i]) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def train(self, Doc tokens, object gold_tag_strs): + assert len(tokens) == len(gold_tag_strs) cdef int i cdef int loss cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 - for g in gold_tag_strs] + golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] correct = 0 for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) loss = golds[i] != -1 and guess != golds[i] - - self.vocab.morphology.assign_tag(&tokens.data[i], guess) + self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) correct += loss == 0 self.freqs[TAG][tokens.data[i].tag] += 1 return correct From b6b1e1aa1296f7f8a3fb0a669c290ef12853073d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 10:26:02 +0200 Subject: [PATCH 113/138] * Add link for Finnish model --- bin/init_model.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/init_model.py b/bin/init_model.py index 0badf71fc..e81d668aa 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -38,6 +38,7 @@ from spacy.parts_of_speech import NOUN, VERB, ADJ import spacy.en import spacy.de +import spacy.fi @@ -184,7 +185,8 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): def main(lang_id, lang_data_dir, corpora_dir, model_dir): languages = { 'en': spacy.en.English.default_lex_attrs(), - 'de': spacy.de.Deutsch.default_lex_attrs() + 'de': spacy.de.Deutsch.default_lex_attrs(), + 'fi': spacy.fi.Finnish.default_lex_attrs() } model_dir = Path(model_dir) @@ -203,6 +205,11 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): if (lang_data_dir / 'gazetteer.json').exists(): copyfile(str(lang_data_dir / 'gazetteer.json'), str(model_dir / 'vocab' / 'gazetteer.json')) + + if (lang_data_dir / 'lemma_rules.json').exists(): + copyfile(str(lang_data_dir / 'lemma_rules.json'), + str(model_dir / 'vocab' / 'lemma_rules.json')) + if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists(): copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet')) From f0a7c99554db884aa602120d3a709f6f77419639 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 10:26:19 +0200 Subject: [PATCH 114/138] * Relax rule-requirement in lemmatizer --- spacy/lemmatizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 05029391b..5e08e80a4 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -35,7 +35,7 @@ class Lemmatizer(object): pos = 'adj' else: return string - lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) + lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, [])) return min(lemmas) def noun(self, string): From 5b89e2454c0386eba8f0a7e1e6fff901dee45dbd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 10:26:36 
+0200 Subject: [PATCH 115/138] * Improve error-reporting in tagger --- spacy/tagger.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 7b638c724..dff96e6ea 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -159,7 +159,11 @@ cdef class Tagger: cdef int i cdef int loss cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] + try: + golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] + except ValueError: + raise ValueError( + [g for g in gold_tag_strs if g is not None and g not in self.tag_names]) correct = 0 for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) From 86c4a8e3e29b756888883d32d9e2c5f5229424c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 23:11:51 +0200 Subject: [PATCH 116/138] * Work on new morphology organization --- spacy/morphology.pyx | 95 +++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 2b8fa3960..7f6afa016 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -7,6 +7,12 @@ except ImportError: import json from spacy.parts_of_speech import UNIV_POS_NAMES + + +cdef struct MorphAnalysisC: + uint64_t[4] features + attr_t lemma + attr_t pos cdef class Morphology: @@ -25,39 +31,74 @@ cdef class Morphology: self.tag_ids = {} for i, tag_str in enumerate(self.tag_names): self.tag_ids[tag_str] = i + self._cache = PreshMapArray() - cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1: - # TODO Caching - props = self.tag_map[self.tag_names[tag]] - token.pos = UNIV_POS_NAMES[props['pos'].upper()] - token.tag = strings[self.tag_names[tag]] - lemma = self.lemmatizer(strings[token.lex.orth], token.pos) - token.lemma = strings[lemma] - #token.inflection = # TODO + cdef int assign_tag(self, TokenC* token, tag) except -1: + analysis = self._cache.get(tag, token.lex.orth) + if analysis is NULL: + analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) + cached = self.decode_tag(tag) + cached.lemma = self.lemmatize(token.pos, token.lex) + token.lemma = analysis.lemma + token.pos = analysis.pos + token.tag = analysis.tag + token.morph = analysis.features - cdef int assign_from_dict(self, TokenC* token, props) except -1: + cdef int assign_feature(self, TokenC* token, feature, value) except -1: pass def load_morph_exceptions(self, dict exc): - pass # Map (form, pos) to (lemma, inflection) - #cdef unicode pos_str - #cdef unicode form_str - #cdef unicode lemma_str - #cdef dict entries - #cdef dict props - #cdef int lemma - #cdef attr_t orth - #cdef int pos - #for pos_str, entries in exc.items(): - # pos = self.tag_names.index(pos_str) - # for form_str, props in entries.items(): - # lemma_str = props.get('L', form_str) - # orth = self.strings[form_str] - # cached = self.mem.alloc(1, sizeof(InflectedLemma)) - # cached.lemma = self.strings[lemma_str] - # set_morph_from_dict(&cached.morph, props) - # self._morph_cache.set(pos, orth, cached) + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef attr_t orth + cdef int pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + orth = self.strings[form_str] + cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) + cached.lemma = 
self.strings[lemma_str] + self.set_features(cached, props) + self._cache.set(pos, orth, cached) + + def _load_special_tokenization(self, special_cases): + '''Add a special-case tokenization rule. + ''' + cdef int i + cdef list substrings + cdef unicode chunk + cdef unicode form + cdef unicode lemma + cdef dict props + cdef LexemeC** lexemes + cdef hash_t hashed + for chunk, substrings in sorted(special_cases.items()): + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + # Set the special tokens up to have morphology and lemmas if + # specified, otherwise use the part-of-speech tag (if specified) + form = props['F'] + tokens[i].lex = self.vocab.get(self.vocab.mem, form) + morphology = self.vocab.morphology.decode_dict(props) + tokens[i].lemma = morph_analysis.lemma + tokens[i].pos = morph_analysis.pos + tokens[i].tag = morph_analysis.tag + tokens[i].morph = morph_analysis.morph + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.length = len(substrings) + cached.is_lex = False + cached.data.tokens = tokens + hashed = hash_string(chunk) + self._specials.set(hashed, cached) + self._cache.set(hashed, cached) + + #cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: From c2307fa9ee11e883a89086de26a877382a64f343 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 28 Aug 2015 02:02:33 +0200 Subject: [PATCH 117/138] * More work on language-generic parsing --- spacy/fi/__init__.py | 11 +++ spacy/language.py | 5 +- spacy/morphology.pxd | 33 ++++++-- spacy/morphology.pyx | 181 +++++++++++-------------------------------- spacy/structs.pxd | 13 +--- spacy/tagger.pyx | 15 ++-- spacy/tokenizer.pxd | 7 +- spacy/tokenizer.pyx | 40 ++-------- spacy/tokens/doc.pxd | 4 +- spacy/tokens/doc.pyx | 2 +- spacy/vocab.pxd | 3 +- spacy/vocab.pyx | 37 +++++---- 12 files changed, 129 insertions(+), 222 deletions(-) create mode 100644 spacy/fi/__init__.py diff --git a/spacy/fi/__init__.py b/spacy/fi/__init__.py new file mode 100644 index 000000000..8e7173767 --- /dev/null +++ b/spacy/fi/__init__.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language + + +class Finnish(Language): + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') diff --git a/spacy/language.py b/spacy/language.py index 2a07d1f5f..36ca5c636 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -148,13 +148,10 @@ class Language(object): vectors = cls.default_vectors(data_dir) if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs(data_dir) - if morphology is None: - morphology = cls.default_morphology(path.join(data_dir, 'vocab')) return Vocab.from_dir( path.join(data_dir, 'vocab'), get_lex_attr=get_lex_attr, - vectors=vectors, - morphology=morphology) + vectors=vectors) @classmethod def default_tokenizer(cls, vocab, data_dir): diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index e0f85f96f..eb2bb97f5 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,18 +1,41 @@ +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMapArray +from libc.stdint cimport uint64_t + from .structs cimport TokenC from .strings cimport StringStore +from .typedefs cimport attr_t +from .parts_of_speech cimport univ_pos_t + + +cdef struct RichTagC: + uint64_t morph + int id + univ_pos_t pos + attr_t name + + +cdef struct MorphAnalysisC: + RichTagC tag + attr_t lemma cdef class Morphology: + cdef readonly Pool mem cdef readonly 
object strings cdef public object lemmatizer - cdef public object tag_map + cdef public object n_tags + cdef public object reverse_index cdef public object tag_names - cdef public object tag_ids - cdef public int n_tags - cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1 + cdef RichTagC* rich_tags + cdef PreshMapArray _cache + + cdef int assign_tag(self, TokenC* token, tag) except -1 + + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 + - cdef int assign_from_dict(self, TokenC* token, props) except -1 # #cpdef enum Feature_t: diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 7f6afa016..acca5eb9e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -6,15 +6,10 @@ try: except ImportError: import json -from spacy.parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech cimport ADJ, VERB, NOUN -cdef struct MorphAnalysisC: - uint64_t[4] features - attr_t lemma - attr_t pos - - cdef class Morphology: @classmethod def from_dir(cls, data_dir, lemmatizer=None): @@ -23,32 +18,37 @@ cdef class Morphology: lemmatizer = Lemmatizer.from_dir(data_dir) return cls(tag_map, {}, lemmatizer) - def __init__(self, tag_map, fused_tokens, lemmatizer): + def __init__(self, string_store, tag_map, lemmatizer): + self.mem = Pool() + self.strings = string_store self.lemmatizer = lemmatizer - self.tag_map = tag_map self.n_tags = len(tag_map) self.tag_names = tuple(sorted(tag_map.keys())) - self.tag_ids = {} - for i, tag_str in enumerate(self.tag_names): - self.tag_ids[tag_str] = i - self._cache = PreshMapArray() + self.reverse_index = {} + for i, (tag_str, props) in enumerate(sorted(tag_map.items())): + self.rich_tags[i].id = i + self.rich_tags[i].name = self.strings[tag_str] + self.rich_tags[i].morph = 0 + self.reverse_index[self.rich_tags[i].name] = i + self._cache = PreshMapArray(self.n_tags) cdef int assign_tag(self, TokenC* token, tag) except -1: - analysis = self._cache.get(tag, token.lex.orth) + cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag + analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) - cached = self.decode_tag(tag) - cached.lemma = self.lemmatize(token.pos, token.lex) + analysis.tag = self.rich_tags[tag_id] + analysis.lemma = self.lemmatize(tag, token.lex.orth) token.lemma = analysis.lemma - token.pos = analysis.pos - token.tag = analysis.tag - token.morph = analysis.features + token.pos = analysis.tag.pos + token.tag = analysis.tag.name + token.morph = analysis.tag.morph - cdef int assign_feature(self, TokenC* token, feature, value) except -1: + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1: pass def load_morph_exceptions(self, dict exc): - # Map (form, pos) to (lemma, inflection) + # Map (form, pos) to (lemma, rich tag) cdef unicode pos_str cdef unicode form_str cdef unicode lemma_str @@ -57,121 +57,30 @@ cdef class Morphology: cdef int lemma cdef attr_t orth cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) + for tag_str, entries in exc.items(): + tag = self.strings[tag_str] + rich_tag = self.rich_tags[self.reverse_index[tag]] for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) - cached.lemma = self.strings[lemma_str] - self.set_features(cached, props) - self._cache.set(pos, orth, cached) + orth = 
self.strings[form_str] + for name_str, value_str in props.items(): + if name_str == 'L': + cached.lemma = self.strings[value_str] + else: + self.assign_feature(&cached.tag.morph, name_str, value_str) + if cached.lemma == 0: + cached.lemma = self.lemmatize(rich_tag.pos, orth) + self._cache.set(rich_tag.pos, orth, cached) - def _load_special_tokenization(self, special_cases): - '''Add a special-case tokenization rule. - ''' - cdef int i - cdef list substrings - cdef unicode chunk - cdef unicode form - cdef unicode lemma - cdef dict props - cdef LexemeC** lexemes - cdef hash_t hashed - for chunk, substrings in sorted(special_cases.items()): - tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) - for i, props in enumerate(substrings): - # Set the special tokens up to have morphology and lemmas if - # specified, otherwise use the part-of-speech tag (if specified) - form = props['F'] - tokens[i].lex = self.vocab.get(self.vocab.mem, form) - morphology = self.vocab.morphology.decode_dict(props) - tokens[i].lemma = morph_analysis.lemma - tokens[i].pos = morph_analysis.pos - tokens[i].tag = morph_analysis.tag - tokens[i].morph = morph_analysis.morph - cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) - cached.length = len(substrings) - cached.is_lex = False - cached.data.tokens = tokens - hashed = hash_string(chunk) - self._specials.set(hashed, cached) - self._cache.set(hashed, cached) - - - - -#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: -# morph.number = props.get('number', 0) -# morph.tenspect = props.get('tenspect', 0) -# morph.mood = props.get('mood', 0) -# morph.gender = props.get('gender', 0) -# morph.person = props.get('person', 0) -# morph.case = props.get('case', 0) -# morph.misc = props.get('misc', 0) -# -# -#cdef class Morphology: -# cdef Pool mem -# cdef PreshMap table -# -# def __init__(self, tags, exceptions): -# pass -# -# def __getitem__(self, hash_t id_): -# pass -# -# cdef const InflectionC* get(self, hash_t key) except NULL: -# pass -# -# cdef MorphAnalysis analyse(const TokenC* token) except -1: -# cdef struct MorphAnalysis morphology -# tokens[i].pos = tag.pos -# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) -# if cached is NULL: -# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) -# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) -# cached.morph = tag.morph -# self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) -# tokens[i].lemma = cached.lemma -# tokens[i].morph = cached.morph -# -# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: -# if self.lemmatizer is None: -# return lex.orth -# cdef unicode py_string = self.strings[lex.orth] -# if pos != NOUN and pos != VERB and pos != ADJ: -# return lex.orth -# cdef set lemma_strings -# cdef unicode lemma_string -# lemma_strings = self.lemmatizer(py_string, pos) -# lemma_string = sorted(lemma_strings)[0] -# lemma = self.strings[lemma_string] -# return lemma -# -# -#cdef class Inflection: -# cdef InflectionC* c -# -# def __init__(self, container, id_): -# self.c = container[id_] -# self.container = container -# -# for i, feat_id in enumerate(feat_ids): -# feature, value = parse_id(feat_id) -# self.add_value(feature, value, True) -# -# def has(self, Value_t feat_value_id): -# part = feat_value_id % 64 -# bit = feat_value_id / 64 -# if self.value_set[part] & bit: -# return True -# else: -# return False -# -# property pos: def __get__(self): return self.c.pos -# -# property id: def __get__(self): return self.c.id -# -# 
property features: -# pass + def lemmatize(self, const univ_pos_t pos, attr_t orth): + if self.lemmatizer is None: + return orth + cdef unicode py_string = self.strings[orth] + if pos != NOUN and pos != VERB and pos != ADJ: + return orth + cdef set lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, pos) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings[lemma_string] + return lemma diff --git a/spacy/structs.pxd b/spacy/structs.pxd index f150fa312..a0a3d65a3 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -25,17 +25,6 @@ cdef struct LexemeC: float sentiment float l2_norm -cdef struct MorphFeatC: - int name - int value - - -cdef struct MorphologyC: - uint64_t[4] feature_set - MorphFeatC* features - univ_pos_t pos - int n - cdef struct Entity: int start @@ -54,8 +43,8 @@ cdef struct Constituent: cdef struct TokenC: const LexemeC* lex - const MorphologyC* morph const Constituent* ctnt + uint64_t morph univ_pos_t pos bint spacy int tag diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index dff96e6ea..6fea4af88 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -104,7 +104,7 @@ cdef class Tagger: @classmethod def blank(cls, vocab, templates): - model = Model(vocab.morphology.n_tags, templates, model_loc=None) + model = Model(vocab.n_tags, templates, model_loc=None) return cls(vocab, model) @classmethod @@ -113,7 +113,7 @@ cdef class Tagger: templates = json.loads(open(path.join(data_dir, 'templates.json'))) else: templates = cls.default_templates() - model = Model(vocab.morphology.n_tags, templates, data_dir) + model = Model(vocab.n_tags, templates, data_dir) return cls(vocab, model) def __init__(self, Vocab vocab, model): @@ -128,7 +128,7 @@ cdef class Tagger: @property def tag_names(self): - return self.vocab.morphology.tag_names + return self.vocab.tag_names def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. 
@@ -143,14 +143,15 @@ cdef class Tagger: for i in range(tokens.length): if tokens.data[i].pos == 0: guess = self.predict(i, tokens.data) - self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) + self.vocab.morphology.assign_tag(&tokens.data[i], guess) + tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): - self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i]) + self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length @@ -168,7 +169,9 @@ cdef class Tagger: for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) loss = golds[i] != -1 and guess != golds[i] - self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) + + self.vocab.morphology.assign_tag(&tokens.data[i], guess) + correct += loss == 0 self.freqs[TAG][tokens.data[i].tag] += 1 return correct diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 19b8aa026..9d60d2a6e 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -7,12 +7,7 @@ from .typedefs cimport hash_t from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc -from .vocab cimport Vocab, _Cached - - -cdef union LexemesOrTokens: - const LexemeC* const* lexemes - TokenC* tokens +from .vocab cimport Vocab, LexemesOrTokens, _Cached cdef class Tokenizer: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 38daf1c5a..d54770d2b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -192,9 +192,7 @@ cdef class Tokenizer: tokens.push_back(prefixes[0][i], False) if string: cache_hit = self._try_cache(hash_string(string), tokens) - if cache_hit: - pass - else: + if not cache_hit: match = self.find_infix(string) if match is None: tokens.push_back(self.vocab.get(tokens.mem, string), False) @@ -253,38 +251,10 @@ cdef class Tokenizer: cdef LexemeC** lexemes cdef hash_t hashed for chunk, substrings in sorted(special_cases.items()): - tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) - for i, props in enumerate(substrings): - form = props['F'] - tokens[i].lex = self.vocab.get(self.vocab.mem, form) - lemma = props.get('L', form) - tokens[i].lemma = self.vocab.strings[lemma] - #TODO - #self.vocab.morphology.assign_from_dict(&tokens[i], props) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) cached.is_lex = False - cached.data.tokens = tokens - hashed = hash_string(chunk) - self._specials.set(hashed, cached) - self._cache.set(hashed, cached) - - -#if lemma is not None: -# tokens[i].lemma = self.vocab.strings[lemma] -#else: -# tokens[i].lemma = 0 -#if 'pos' in props: -# inflection = self.vocab.morphology.get(props['pos']) -# inflection.assign(&tokens[i]) -# # These are defaults, which can be over-ridden by the -# # token-specific props. -# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']] -# #tokens[i].pos = pos -# ## These are defaults, which can be over-ridden by the -# ## token-specific props. 
-# #set_morph_from_dict(&tokens[i].morph, morph_features) -# #if tokens[i].lemma == 0: -# # tokens[i].lemma = tokens[i].lex.orth -##set_morph_from_dict(&tokens[i].morph, props) - + cached.data.tokens = self.vocab.make_fused_token(substrings) + key = hash_string(chunk) + self._specials.set(key, cached) + self._cache.set(key, cached) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 121018770..a13858175 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr -ctypedef TokenC* TokenC_ptr +ctypedef const TokenC* const_TokenC_ptr ctypedef fused LexemeOrToken: const_Lexeme_ptr - TokenC_ptr + const_TokenC_ptr cdef class Doc: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0fa562dfb..80facc8db 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -209,7 +209,7 @@ cdef class Doc: if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.data[self.length] - if LexemeOrToken is TokenC_ptr: + if LexemeOrToken is const_TokenC_ptr: t[0] = lex_or_tok[0] else: t.lex = lex_or_tok diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 5c88dca68..d9bf32582 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME cdef union LexemesOrTokens: const LexemeC* const* lexemes - TokenC* tokens + const TokenC* tokens cdef struct _Cached: @@ -37,6 +37,7 @@ cdef class Vocab: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef const TokenC* make_fused_token(self, substrings) except NULL cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index fa196166e..085fb38f9 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -17,6 +17,7 @@ from .strings cimport hash_string from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile +from .lemmatizer import Lemmatizer from cymem.cymem cimport Address from . import util @@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - @classmethod - def default_morphology(cls): - return Morphology({'VBZ': ['VERB', {}]}, [], None) - - def __init__(self, get_lex_attr=None, morphology=None, vectors=None): - self.get_lex_attr = get_lex_attr - if morphology is None: - morphology = self.default_morphology() - self.morphology = morphology - + def __init__(self, get_lex_attr=None, tag_map=None, vectors=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() + self.get_lex_attr = get_lex_attr + self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {})) self.length = 1 self._serializer = None @@ -60,10 +54,9 @@ cdef class Vocab: raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) - cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, - morphology=morphology) - self.load_lexemes(path.join(data_dir, 'strings.txt'), - path.join(data_dir, 'lexemes.bin')) + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) + self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) return self @@ -172,6 +165,22 @@ cdef class Vocab: orth = id_or_string return Lexeme(self, orth) + cdef const TokenC* make_fused_token(self, substrings) except NULL: + cdef int i + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + token = &tokens[i] + # Set the special tokens up to have morphology and lemmas if + # specified, otherwise use the part-of-speech tag (if specified) + token.lex = self.get(self.mem, props['F']) + if 'pos' in props: + self.morphology.assign_tag(token, props['pos']) + if 'L' in props: + tokens[i].lemma = self.strings[props['L']] + for feature, value in props.get('morph', {}).items(): + self.morphology.assign_feature(&token.morph, feature, value) + return tokens + def dump(self, loc): if path.exists(loc): assert not path.isdir(loc) From 534e3dda3cbd4f8677fc30f75879bfc3225a6b2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 28 Aug 2015 03:44:54 +0200 Subject: [PATCH 118/138] * More work on language independent parsing --- spacy/language.py | 7 +------ spacy/morphology.pxd | 2 +- spacy/morphology.pyx | 21 ++++++++++----------- spacy/tagger.pyx | 6 +++--- spacy/vocab.pyx | 4 +++- 5 files changed, 18 insertions(+), 22 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 36ca5c636..881df7d1a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -6,7 +6,6 @@ except ImportError: import json from .tokenizer import Tokenizer -from .morphology import Morphology from .vocab import Vocab from .syntax.parser import Parser from .tagger import Tagger @@ -132,16 +131,12 @@ class Language(object): def default_data_dir(cls): return path.join(path.dirname(__file__), 'data') - @classmethod - def default_morphology(cls, data_dir): - return Morphology.from_dir(data_dir) - @classmethod def default_vectors(cls, data_dir): return None @classmethod - def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None): + def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None): if data_dir is None: data_dir = cls.default_data_dir() if vectors is None: diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index eb2bb97f5..2229da0ad 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -22,7 +22,7 @@ cdef struct MorphAnalysisC: cdef class Morphology: cdef readonly Pool mem - cdef readonly object strings + cdef readonly StringStore strings cdef public object lemmatizer cdef public object n_tags cdef public object reverse_index diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index acca5eb9e..12d435c7d 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -11,20 +11,15 @@ from .parts_of_speech cimport ADJ, VERB, NOUN cdef class Morphology: - @classmethod - def from_dir(cls, data_dir, lemmatizer=None): - tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) - if lemmatizer is None: - lemmatizer = Lemmatizer.from_dir(data_dir) - return cls(tag_map, {}, lemmatizer) - - 
def __init__(self, string_store, tag_map, lemmatizer): + def __init__(self, StringStore string_store, tag_map, lemmatizer): self.mem = Pool() self.strings = string_store self.lemmatizer = lemmatizer - self.n_tags = len(tag_map) + self.n_tags = len(tag_map) + 1 self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} + + self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) for i, (tag_str, props) in enumerate(sorted(tag_map.items())): self.rich_tags[i].id = i self.rich_tags[i].name = self.strings[tag_str] @@ -33,12 +28,16 @@ cdef class Morphology: self._cache = PreshMapArray(self.n_tags) cdef int assign_tag(self, TokenC* token, tag) except -1: - cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag + cdef int tag_id + if isinstance(tag, basestring): + tag_id = self.reverse_index[self.strings[tag]] + else: + tag_id = tag analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) analysis.tag = self.rich_tags[tag_id] - analysis.lemma = self.lemmatize(tag, token.lex.orth) + analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth) token.lemma = analysis.lemma token.pos = analysis.tag.pos token.tag = analysis.tag.name diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 6fea4af88..756bb7ea4 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -104,7 +104,7 @@ cdef class Tagger: @classmethod def blank(cls, vocab, templates): - model = Model(vocab.n_tags, templates, model_loc=None) + model = Model(vocab.morphology.n_tags, templates, model_loc=None) return cls(vocab, model) @classmethod @@ -113,7 +113,7 @@ cdef class Tagger: templates = json.loads(open(path.join(data_dir, 'templates.json'))) else: templates = cls.default_templates() - model = Model(vocab.n_tags, templates, data_dir) + model = Model(vocab.morphology.n_tags, templates, data_dir) return cls(vocab, model) def __init__(self, Vocab vocab, model): @@ -128,7 +128,7 @@ cdef class Tagger: @property def tag_names(self): - return self.vocab.tag_names + return self.vocab.morphology.tag_names def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 085fb38f9..596570a98 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -49,13 +49,15 @@ cdef class Vocab: self._serializer = None @classmethod - def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None): + def from_dir(cls, data_dir, get_lex_attr=None, vectors=None): if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) + self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) From fd1eeb3102d65504d0267861355d61e24e731086 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 04:13:03 +0200 Subject: [PATCH 119/138] * Add POS attribute support in get_attr --- spacy/tokens/doc.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7994c97c3..955e9b45f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -12,6 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN +from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport check_flag from ..lexeme cimport get_attr as get_lex_attr from .spans cimport Span @@ -327,6 +328,9 @@ cdef class Doc: elif attr_id == TAG: for i in range(length): tokens[i].tag = values[i] + elif attr_id == POS: + for i in range(length): + tokens[i].pos = values[i] elif attr_id == DEP: for i in range(length): tokens[i].dep = values[i] From 5edac11225b4435daac5776dd52ca105bc1d5233 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 04:15:00 +0200 Subject: [PATCH 120/138] * Wrap self.parse in nogil, and break if an invalid move is predicted. The invalid break is a work-around that papers over likely bugs, but we can't easily break in the nogil block, and otherwise we'll get an infinite loop. Need to set this as an error flag. 
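As a sketch of the error flag the note above calls for, the break could record the failure on the
state instead of exiting silently. The _parse_error attribute here is hypothetical (it would need a
cdef bint slot on StateClass) and is not part of this patch; everything else mirrors the diff below.

    cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
        while not stcls.is_final():
            self.predict(stcls, &eg)
            if not eg.is_valid[eg.guess]:
                stcls._parse_error = True      # hypothetical flag, checked by the caller outside nogil
                break
            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
        self.moves.finalize_state(stcls)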
--- spacy/syntax/parser.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 6282339bd..59b90920c 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -84,8 +84,7 @@ cdef class Parser: cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE, self.model.n_feats, self.model.n_feats) - with nogil: - self.parse(stcls, eg.c) + self.parse(stcls, eg.c) tokens.set_parse(stcls._sent) cdef void predict(self, StateClass stcls, ExampleC* eg) nogil: @@ -98,6 +97,8 @@ cdef class Parser: cdef void parse(self, StateClass stcls, ExampleC eg) nogil: while not stcls.is_final(): self.predict(stcls, &eg) + if not eg.is_valid[eg.guess]: + break self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label) self.moves.finalize_state(stcls) From 571b6eda88bb72078b88b9a600455cb8ed3ab622 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 05:40:10 +0200 Subject: [PATCH 121/138] * Upd tests --- tests/parser/test_initial_actions_parse.py | 5 ++++- tests/test_matcher.py | 12 ++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index c1603cd93..9f570d8be 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -4,7 +4,10 @@ import pytest def test_initial(EN): doc = EN.tokenizer(u'I ate the pizza with anchovies.') EN.tagger(doc) - next_actions = EN.parser.partial(doc, ['L-nsubj', 'S', 'L-det']) + with EN.parser.step_through(doc) as stepwise: + stepwise.transition('L-nsubj') + stepwise.transition('S') + stepwise.transition('L-det') assert doc[0].head.i == 1 assert doc[1].head.i == 1 assert doc[2].head.i == 3 diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 0014e1110..1b748cb53 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -3,7 +3,7 @@ import pytest from spacy.strings import StringStore from spacy.matcher import * -from spacy.attrs import ORTH +from spacy.attrs import LOWER from spacy.tokens.doc import Doc from spacy.vocab import Vocab @@ -13,7 +13,7 @@ def matcher(EN): patterns = { 'Javascript': ['PRODUCT', {}, [[{'ORTH': 'JavaScript'}]]], 'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]], - 'Java': ['PRODUCT', {}, [[{'ORTH': 'Java'}]]], + 'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]], } return Matcher(EN.vocab, patterns) @@ -33,7 +33,7 @@ def test_match_start(matcher, EN): def test_match_end(matcher, EN): - tokens = EN('I like Java') + tokens = EN('I like java') assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 3)] @@ -43,17 +43,17 @@ def test_match_middle(matcher, EN): def test_match_multi(matcher, EN): - tokens = EN('I like Google Now and Java best') + tokens = EN('I like Google Now and java best') assert matcher(tokens) == [(EN.vocab.strings['PRODUCT'], 2, 4), (EN.vocab.strings['PRODUCT'], 5, 6)] def test_match_preserved(matcher, EN): - doc = EN.tokenizer('I like Java') + doc = EN.tokenizer('I like java') EN.tagger(doc) EN.entity(doc) assert len(doc.ents) == 0 - doc = EN.tokenizer('I like Java') + doc = EN.tokenizer('I like java') matcher(doc) assert len(doc.ents) == 1 EN.tagger(doc) From 7e4fea67d39dd85b9aeed396a055cdd7e4e31971 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 10:48:36 +0200 Subject: [PATCH 122/138] * Fix bug in token subtree, introduced by duplication of L/R code in Stateclass. Need to consolidate the two methods. 
--- spacy/tokens/token.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f1f2696cb..cc50fdd08 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -142,7 +142,7 @@ cdef class Token: """The leftward immediate children of the word, in the syntactic dependency parse. """ - cdef const TokenC* ptr = self.c - self.i + cdef const TokenC* ptr = self.c - (self.i - self.c.l_edge) while ptr < self.c: # If this head is still to the right of us, we can skip to it # No token that's between this token and this head could be our @@ -160,7 +160,7 @@ cdef class Token: def __get__(self): """The rightward immediate children of the word, in the syntactic dependency parse.""" - cdef const TokenC* ptr = (self.c - self.i) + (self.array_len - 1) + cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] while ptr > self.c: # If this head is still to the right of us, we can skip to it From 950ce3666084aae00c5f8300515db8004c86198c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:51:30 +0200 Subject: [PATCH 123/138] * Update init model --- bin/init_model.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index e81d668aa..cffd9df96 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -20,6 +20,7 @@ from __future__ import unicode_literals from ast import literal_eval import math import gzip +import json import plac from pathlib import Path @@ -39,6 +40,7 @@ from spacy.parts_of_speech import NOUN, VERB, ADJ import spacy.en import spacy.de import spacy.fi +import spacy.it @@ -143,7 +145,7 @@ def _read_senses(loc): return lexicon -def setup_vocab(get_lex_attr, src_dir, dst_dir): +def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() @@ -152,7 +154,7 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(get_lex_attr=get_lex_attr) + vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: @@ -186,7 +188,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): languages = { 'en': spacy.en.English.default_lex_attrs(), 'de': spacy.de.Deutsch.default_lex_attrs(), - 'fi': spacy.fi.Finnish.default_lex_attrs() + 'fi': spacy.fi.Finnish.default_lex_attrs(), + 'it': spacy.it.Italian.default_lex_attrs(), } model_dir = Path(model_dir) @@ -199,8 +202,9 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): if not model_dir.exists(): model_dir.mkdir() + tag_map = json.load((lang_data_dir / 'tag_map.json').open()) setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') - setup_vocab(languages[lang_id], corpora_dir, model_dir / 'vocab') + setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab') if (lang_data_dir / 'gazetteer.json').exists(): copyfile(str(lang_data_dir / 'gazetteer.json'), From d1eea2d865b0b42d02195143788343ea3eb620b3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:51:48 +0200 Subject: [PATCH 124/138] * Update train.py for language-generic spaCy --- bin/parser/train.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 68217fcb3..abd5eb16e 100755 --- a/bin/parser/train.py +++ 
b/bin/parser/train.py @@ -14,7 +14,6 @@ import re import spacy.util from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir from spacy.syntax.util import Config from spacy.gold import read_json_file @@ -22,6 +21,11 @@ from spacy.gold import GoldParse from spacy.scorer import Scorer +from spacy.syntax.arc_eager import ArcEager +from spacy.syntax.ner import BiluoPushDown +from spacy.tagger import Tagger +from spacy.syntax.parser import Parser + def _corrupt(c, noise_level): if random.random() >= noise_level: @@ -80,32 +84,28 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', beam_width=1, verbose=False, use_orig_arc_eager=False): dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') if path.exists(dep_model_dir): shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) if path.exists(ner_model_dir): shutil.rmtree(ner_model_dir) os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) os.mkdir(ner_model_dir) - setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + labels=ArcEager.get_labels(gold_tuples), beam_width=beam_width) Config.write(ner_model_dir, 'config', features='ner', seed=seed, - labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + labels=BiluoPushDown.get_labels(gold_tuples), beam_width=0) if n_sents > 0: gold_tuples = gold_tuples[:n_sents] - nlp = Language(data_dir=model_dir) - + nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) + nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) + nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") for itn in range(n_iter): scorer = Scorer() @@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc, scorer.token_acc)) - nlp.end_training() + nlp.end_training(model_dir) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None): From e35bb36be75eb90daf9ff5ef0d79cfb14940a281 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:52:32 +0200 Subject: [PATCH 125/138] * Ensure Lexeme.check_flag returns a boolean value --- spacy/lexeme.pxd | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 510840b2b..6f333829f 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -35,7 +35,10 @@ cdef class Lexeme: @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: if feat_name < (sizeof(flags_t) * 8): - return Lexeme.check_flag(lex, feat_name) + if Lexeme.check_flag(lex, feat_name): + return 1 + else: + return 0 elif feat_name == ID: return lex.id elif feat_name == ORTH: @@ -78,7 +81,10 @@ cdef class Lexeme: @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: - return lexeme.flags & (1 << flag_id) + if lexeme.flags & (1 << flag_id): + return True + else: + return False @staticmethod cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: From 
7cc56ada6eaa5d662f044cab3c12ece1035c5274 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:52:51 +0200 Subject: [PATCH 126/138] * Temporarily add py_set_flag attribute in Lexeme --- spacy/lexeme.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index e99bcfa7c..2c69a527c 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -29,6 +29,12 @@ cdef class Lexeme: self.c = vocab.get_by_orth(vocab.mem, orth) assert self.c.orth == orth + def py_set_flag(self, attr_id_t flag_id): + Lexeme.set_flag(self.c, flag_id, True) + + def py_check_flag(self, attr_id_t flag_id): + return True if Lexeme.check_flag(self.c, flag_id) else False + property orth_: def __get__(self): return self.vocab.strings[self.c.orth] From 6427a3fcac3035e78a6f004f8e36e401a68acc1c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:53:12 +0200 Subject: [PATCH 127/138] * Temporarily import flag attributes in matcher --- spacy/matcher.pyx | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 2cc91a368..f6f1ad3ba 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -8,6 +8,7 @@ from cymem.cymem cimport Pool from libcpp.vector cimport vector from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE +from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab @@ -53,6 +54,8 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1: cdef int i for i in range(pattern.length): if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value: + print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value + print get_token_attr(token, pattern.spec[i].attr) return False return True @@ -76,7 +79,10 @@ def _convert_strings(token_specs, string_store): attr = map_attr_name(attr) if isinstance(value, basestring): value = string_store[value] + if isinstance(value, bool): + value = int(value) converted[-1].append((attr, value)) + print "Converted", converted[-1] return converted @@ -92,6 +98,32 @@ def map_attr_name(attr): return SHAPE elif attr == 'NORM': return NORM + elif attr == 'FLAG13': + return FLAG13 + elif attr == 'FLAG14': + return FLAG14 + elif attr == 'FLAG15': + return FLAG15 + elif attr == 'FLAG16': + return FLAG16 + elif attr == 'FLAG17': + return FLAG17 + elif attr == 'FLAG18': + return FLAG18 + elif attr == 'FLAG19': + return FLAG19 + elif attr == 'FLAG20': + return FLAG20 + elif attr == 'FLAG21': + return FLAG21 + elif attr == 'FLAG22': + return FLAG22 + elif attr == 'FLAG23': + return FLAG23 + elif attr == 'FLAG24': + return FLAG24 + elif attr == 'FLAG25': + return FLAG25 else: raise Exception("TODO: Finish supporting attr mapping %s" % attr) @@ -130,6 +162,7 @@ cdef class Matcher: # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: + assert len(spec) >= 1 spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @@ -142,11 +175,13 @@ cdef class Matcher: cdef Pattern* state matches = [] for token_i in range(doc.length): + print 'check', doc[token_i].orth_ token = &doc.data[token_i] q = 0 for i in range(partials.size()): state = partials.at(i) if match(state, token): + print 'match!' 
if is_final(state): matches.append(get_entity(state, token, token_i)) else: @@ -156,6 +191,7 @@ cdef class Matcher: for i in range(self.n_patterns): state = self.patterns[i] if match(state, token): + print 'match!' if is_final(state): matches.append(get_entity(state, token, token_i)) else: From 9eae9837c4b97a681fdfe52f6380ab56fe7b6065 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:53:39 +0200 Subject: [PATCH 128/138] * Fix morphology look up --- spacy/morphology.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 12d435c7d..fc6a4936b 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -30,7 +30,11 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int tag_id if isinstance(tag, basestring): - tag_id = self.reverse_index[self.strings[tag]] + try: + tag_id = self.reverse_index[self.strings[tag]] + except KeyError: + print tag + raise else: tag_id = tag analysis = self._cache.get(tag_id, token.lex.orth) From c9f2082e3cb09484cad34ea30505b8dc5dd2bf41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:54:51 +0200 Subject: [PATCH 129/138] * Fix compilation error in en/tag_map.json --- lang_data/en/tag_map.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json index b9f8269f7..8678e5afe 100644 --- a/lang_data/en/tag_map.json +++ b/lang_data/en/tag_map.json @@ -56,5 +56,5 @@ "HYPH": {"pos": "punct"}, "XX": {"pos": "x"}, "BES": {"pos": "verb"}, -"HVS": {"pos": "verb"}, +"HVS": {"pos": "verb"} } From 238b2f533ba3512266c470061a3c393f6eaa9b63 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:55:53 +0200 Subject: [PATCH 130/138] * Add lemma rules --- lang_data/de/lemma_rules.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 lang_data/de/lemma_rules.json diff --git a/lang_data/de/lemma_rules.json b/lang_data/de/lemma_rules.json new file mode 100644 index 000000000..e69de29bb From b3703836f9c58af6c3bb0c1c6698e31322feeb1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:56:11 +0200 Subject: [PATCH 131/138] * Add en lemma rules --- lang_data/en/lemma_rules.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 lang_data/en/lemma_rules.json diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json new file mode 100644 index 000000000..c45eb1df6 --- /dev/null +++ b/lang_data/en/lemma_rules.json @@ -0,0 +1,31 @@ +{ + "noun": [ + ["s", ""], + ["ses", "s"], + ["ves", "f"], + ["xes", "x"], + ["zes", "z"], + ["ches", "ch"], + ["shes", "sh"], + ["men", "man"], + ["ies", "y"] + ], + + "verb": [ + ["s", ""], + ["ies", "y"], + ["es", "e"], + ["es", ""], + ["ed", "e"], + ["ed", ""], + ["ing", "e"], + ["ing", ""] + ], + + "adj": [ + ["er", ""], + ["est", ""], + ["er", "e"], + ["est", "e"] + ] +} From 80a66c0159b10f22f545c5515cbc1b6ff096976e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 18:43:44 +0200 Subject: [PATCH 132/138] * Add draft finnish stuff --- lang_data/fi/infix.txt | 3 + lang_data/fi/lemma_rules.json | 1 + lang_data/fi/morphs.json | 0 lang_data/fi/prefix.txt | 21 +++++ lang_data/fi/sample.txt | 3 + lang_data/fi/specials.json | 149 ++++++++++++++++++++++++++++++++++ lang_data/fi/suffix.txt | 26 ++++++ lang_data/fi/tag_map.json | 17 ++++ 8 files changed, 220 insertions(+) create mode 100644 lang_data/fi/infix.txt create mode 
100644 lang_data/fi/lemma_rules.json create mode 100644 lang_data/fi/morphs.json create mode 100644 lang_data/fi/prefix.txt create mode 100644 lang_data/fi/sample.txt create mode 100644 lang_data/fi/specials.json create mode 100644 lang_data/fi/suffix.txt create mode 100644 lang_data/fi/tag_map.json diff --git a/lang_data/fi/infix.txt b/lang_data/fi/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/fi/infix.txt @@ -0,0 +1,3 @@ +\.\.\. +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-z]) diff --git a/lang_data/fi/lemma_rules.json b/lang_data/fi/lemma_rules.json new file mode 100644 index 000000000..0967ef424 --- /dev/null +++ b/lang_data/fi/lemma_rules.json @@ -0,0 +1 @@ +{} diff --git a/lang_data/fi/morphs.json b/lang_data/fi/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/fi/prefix.txt b/lang_data/fi/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/fi/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... diff --git a/lang_data/fi/sample.txt b/lang_data/fi/sample.txt new file mode 100644 index 000000000..12c0bb787 --- /dev/null +++ b/lang_data/fi/sample.txt @@ -0,0 +1,3 @@ +Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern. + +Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs. 
diff --git a/lang_data/fi/specials.json b/lang_data/fi/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/fi/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": [{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": 
"<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F": "c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "I.e."}], +"o.ä.": [{"F": "I.E."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/fi/suffix.txt b/lang_data/fi/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/fi/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\. +(?<=[0-9])km diff --git a/lang_data/fi/tag_map.json b/lang_data/fi/tag_map.json new file mode 100644 index 000000000..6b21a1e29 --- /dev/null +++ b/lang_data/fi/tag_map.json @@ -0,0 +1,17 @@ +{ + "NOUN": {"pos": "NOUN"}, + "VERB": {"pos": "VERB"}, + "PUNCT": {"pos": "PUNCT"}, + "ADV": {"pos": "ADV"}, + "ADJ": {"pos": "ADJ"}, + "PRON": {"pos": "PRON"}, + "PROPN": {"pos": "PROPN"}, + "CONJ": {"pos": "CONJ"}, + "NUM": {"pos": "NUM"}, + "AUX": {"pos": "AUX"}, + "SCONJ": {"pos": "SCONJ"}, + "ADP": {"pos": "ADP"}, + "SYM": {"pos": "SYM"}, + "X": {"pos": "X"}, + "INTJ": {"pos": "INTJ"} +} From 577418986a49f02d972b510bca0cc28610c45a46 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 18:44:10 +0200 Subject: [PATCH 133/138] * Add draft Italian stuff --- lang_data/it/infix.txt | 3 + lang_data/it/morphs.json | 0 lang_data/it/prefix.txt | 21 ++++++ lang_data/it/specials.json | 149 +++++++++++++++++++++++++++++++++++++ lang_data/it/suffix.txt | 26 +++++++ lang_data/it/tag_map.json | 44 +++++++++++ 6 files changed, 243 insertions(+) create mode 100644 lang_data/it/infix.txt create mode 100644 lang_data/it/morphs.json create mode 100644 lang_data/it/prefix.txt create mode 100644 lang_data/it/specials.json create mode 100644 lang_data/it/suffix.txt create mode 100644 lang_data/it/tag_map.json diff --git a/lang_data/it/infix.txt b/lang_data/it/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/it/infix.txt @@ -0,0 +1,3 @@ +\.\.\. +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-z]) diff --git a/lang_data/it/morphs.json b/lang_data/it/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/it/prefix.txt b/lang_data/it/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/it/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... 
diff --git a/lang_data/it/specials.json b/lang_data/it/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/it/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": [{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": 
"<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F": "c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "I.e."}], +"o.ä.": [{"F": "I.E."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/it/suffix.txt b/lang_data/it/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/it/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\. +(?<=[0-9])km diff --git a/lang_data/it/tag_map.json b/lang_data/it/tag_map.json new file mode 100644 index 000000000..514e978a6 --- /dev/null +++ b/lang_data/it/tag_map.json @@ -0,0 +1,44 @@ +{ +"S": {"pos": "NOUN"}, +"E": {"pos": "ADP"}, +"RD": {"pos": "DET"}, +"V": {"pos": "VER"}, +"_": {"pos": "_"}, +"A": {"pos": "ADJ"}, +"SP": {"pos": "PROP"}, +"FF": {"pos": "PUNC"}, +"FS": {"pos": "PUNC"}, +"B": {"pos": "ADV"}, +"CC": {"pos": "CON"}, +"FB": {"pos": "PUNC"}, +"VA": {"pos": "AUX"}, +"PC": {"pos": "PRO"}, +"N": {"pos": "NUM"}, +"RI": {"pos": "DET"}, +"PR": {"pos": "PRO"}, +"CS": {"pos": "SCON"}, +"BN": {"pos": "ADV"}, +"AP": {"pos": "DET"}, +"VM": {"pos": "AUX"}, +"DI": {"pos": "DET"}, +"FC": {"pos": "PUNC"}, +"PI": {"pos": "PRO"}, +"DD": {"pos": "DET"}, +"DQ": {"pos": "DET"}, +"PQ": {"pos": "PRO"}, +"PD": {"pos": "PRO"}, +"NO": {"pos": "ADJ"}, +"PE": {"pos": "PRO"}, +"T": {"pos": "DET"}, +"X": {"pos": "SYM"}, +"SW": {"pos": "X"}, +"NO": {"pos": "PRO"}, +"I": {"pos": "INT"}, +"X": {"pos": "X"}, +"DR": {"pos": "DET"}, +"EA": {"pos": "ADP"}, +"PP": {"pos": "PRO"}, +"X": {"pos": "NUM"}, +"DE": {"pos": "DET"}, +"X": {"pos": "PAR"} +} From d2fc104a26b8832162847b946d0d3973e95cfaaa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 19:45:15 +0200 Subject: [PATCH 134/138] * Begin merge of Gazetteer and DE branches --- spacy/lexeme.pxd | 50 +++++++++++++++++++++++++++++------------- spacy/lexeme.pyx | 30 +++++++++++-------------- spacy/matcher.pyx | 7 +++--- spacy/tokens/doc.pyx | 5 ++--- spacy/tokens/token.pyx | 26 +++++++++++----------- spacy/vocab.pxd | 1 + spacy/vocab.pyx | 12 +++++----- 7 files changed, 74 insertions(+), 57 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 321f7c616..130966765 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE from .structs cimport LexemeC from .strings cimport StringStore +from .vocab cimport Vocab from numpy cimport ndarray @@ -15,21 +16,31 @@ cdef class Lexeme: cdef readonly Vocab 
vocab cdef readonly attr_t orth - cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: - lex.length = props['length'] - lex.orth = vocab.strings[props['orth']] - lex.lower = vocab.strings[props['lower']] - lex.norm = vocab.strings[props['norm']] - lex.shape = vocab.strings[props['shape']] - lex.prefix = vocab.strings[props['prefix']] - lex.suffix = vocab.strings[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] - lex.repvec = empty_vec + @staticmethod + cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length): + cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth) + self.c = lex + self.vocab = vocab + self.orth = lex.orth + + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -56,5 +67,14 @@ cdef class Lexeme: else: return 0 + @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + @staticmethod + cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil: + cdef flags_t one = 1 + if value: + lexeme.flags |= one << flag_id + else: + lexeme.flags &= ~(one << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0b3303f1..832f4fec7 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -26,12 +26,8 @@ cdef class Lexeme: def __init__(self, Vocab vocab, int orth): self.vocab = vocab self.orth = orth - self.c = vocab.get_by_orth(orth) + self.c = vocab.get_by_orth(vocab.mem, orth) - property orth: - def __get__(self): - return self.c.orth - property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -78,44 +74,44 @@ cdef class Lexeme: property is_oov: def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_OOV, x) property is_alpha: def __get__(self): return Lexeme.check_flag(self.c, IS_ALPHA) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ALPHA, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ALPHA, x) property is_ascii: def __get__(self): return Lexeme.check_flag(self.c, IS_ASCII) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_ASCII, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_ASCII, x) property is_digit: def __get__(self): return Lexeme.check_flag(self.c, IS_DIGIT) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_DIGIT, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_DIGIT, x) property is_lower: def __get__(self): return Lexeme.check_flag(self.c, IS_LOWER) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_LOWER, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_LOWER, x) property is_title: def __get__(self): return Lexeme.check_flag(self.c, IS_TITLE) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_TITLE, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, 
IS_TITLE, x) property is_punct: def __get__(self): return Lexeme.check_flag(self.c, IS_PUNCT) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_PUNCT, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_PUNCT, x) property is_space: def __get__(self): return Lexeme.check_flag(self.c, IS_SPACE) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_SPACE, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, IS_SPACE, x) property like_url: def __get__(self): return Lexeme.check_flag(self.c, LIKE_URL) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: def __get__(self): return Lexeme.check_flag(self.c, LIKE_EMAIL) - def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) + def __set__(self, bint x): Lexeme.set_flag(self.c, LIKE_EMAIL, x) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 72473b073..caafe6498 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -102,21 +102,22 @@ cdef class Matcher: cdef readonly int n_patterns def __init__(self, vocab, patterns): + self.vocab = vocab self.mem = Pool() for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] + entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): - etype = vocab.strings[etype] + etype = self.vocab.strings[etype] elif etype is None: etype = -1 # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: - spec = _convert_strings(spec, vocab.strings) + spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 955e9b45f..4ba0d675a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -5,6 +5,7 @@ from libc.stdint cimport uint32_t import numpy import struct +from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport attr_id_t @@ -13,8 +14,6 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t -from ..lexeme cimport check_flag -from ..lexeme cimport get_attr as get_lex_attr from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -48,7 +47,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: elif feat_name == ENT_TYPE: return token.ent_type else: - return get_lex_attr(token.lex, feat_name) + return Lexeme.get_struct_attr(token.lex, feat_name) cdef class Doc: diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cc50fdd08..2fa1366a1 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,5 @@ from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free -from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. 
from cython.view cimport array as cvarray cimport numpy as np @@ -9,6 +8,7 @@ np.import_array() import numpy +from ..lexeme cimport Lexeme from ..parts_of_speech import UNIV_POS_NAMES from ..attrs cimport LEMMA @@ -42,7 +42,7 @@ cdef class Token: return self.string cpdef bint check_flag(self, attr_id_t flag_id) except -1: - return check_flag(self.c.lex, flag_id) + return Lexeme.check_flag(self.c.lex, flag_id) def nbor(self, int i=1): return self.doc[self.i+i] @@ -286,37 +286,37 @@ cdef class Token: return self.vocab.strings[self.c.dep] property is_oov: - def __get__(self): return check_flag(self.c.lex, IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) property is_alpha: - def __get__(self): return check_flag(self.c.lex, IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) property is_ascii: - def __get__(self): return check_flag(self.c.lex, IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return check_flag(self.c.lex, IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) property is_title: - def __get__(self): return check_flag(self.c.lex, IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return check_flag(self.c.lex, IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) property like_url: - def __get__(self): return check_flag(self.c.lex, LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 2503cdcee..710a1b5ec 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -37,6 +37,7 @@ cdef class Vocab: cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 + cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef PreshMap _by_hash cdef PreshMap _by_orth diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index dcb7d575c..2d67e59f2 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,7 +12,6 @@ import math import json from .lexeme cimport EMPTY_LEXEME -from .lexeme cimport set_lex_struct_props from .lexeme cimport Lexeme from .strings cimport hash_string from .orth cimport word_shape @@ -36,12 +35,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. 
''' - def __init__(self, data_dir=None, get_lex_attr=None): + def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=False): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.pos_tags = pos_tags if pos_tags is not None else {} + #self.pos_tags = pos_tags if pos_tags is not None else {} + self.pos_tags = {} self.get_lex_attr = get_lex_attr self.repvec_length = 0 @@ -112,7 +112,7 @@ cdef class Vocab: if is_oov: lex.id = 0 else: - self._add_lex_to_vocab(key, lex) + self._add_lex_to_vocab(hash_string(string), lex) assert lex != NULL, string return lex @@ -125,7 +125,7 @@ cdef class Vocab: cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self.strings, self.repvec_length) + yield Lexeme.from_ptr(addr, self, self.repvec_length) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -157,7 +157,7 @@ cdef class Vocab: raise ValueError("Vocab unable to map type: " "%s. Maps unicode --> Lexeme or " "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) + return Lexeme.from_ptr(lexeme, self, self.repvec_length) def dump(self, loc): if path.exists(loc): From 5ad4527c421884895dde14cfa24f2643bbf2cbe6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 20:18:58 +0200 Subject: [PATCH 135/138] * Rename Deutsch to German --- bin/init_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/init_model.py b/bin/init_model.py index cffd9df96..0780d3c4b 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -187,7 +187,7 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): def main(lang_id, lang_data_dir, corpora_dir, model_dir): languages = { 'en': spacy.en.English.default_lex_attrs(), - 'de': spacy.de.Deutsch.default_lex_attrs(), + 'de': spacy.de.German.default_lex_attrs(), 'fi': spacy.fi.Finnish.default_lex_attrs(), 'it': spacy.it.Italian.default_lex_attrs(), } From 4f8e38271d4c5edc75d2a8000aa2c5be6fb462fd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 20:19:08 +0200 Subject: [PATCH 136/138] * Fix merge errors in lexeme.pxd --- spacy/lexeme.pxd | 51 ------------------------------------------------ 1 file changed, 51 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index f4f8d1e7f..63280155c 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -17,7 +17,6 @@ cdef class Lexeme: cdef readonly attr_t orth @staticmethod -<<<<<<< HEAD cdef inline Lexeme from_ptr(LexemeC* lex, Vocab vocab, int vector_length): cdef Lexeme self = Lexeme.__new__(Lexeme, vocab, lex.orth) self.c = lex @@ -42,22 +41,6 @@ cdef class Lexeme: lex.suffix = value elif name == CLUSTER: lex.cluster = value -======= - cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: - lex.length = props['length'] - lex.orth = vocab.strings[props['orth']] - lex.lower = vocab.strings[props['lower']] - lex.norm = vocab.strings[props['norm']] - lex.shape = vocab.strings[props['shape']] - lex.prefix = vocab.strings[props['prefix']] - lex.suffix = vocab.strings[props['suffix']] - - lex.cluster = props['cluster'] - lex.prob = props['prob'] - lex.sentiment = props['sentiment'] - - lex.flags = props['flags'] ->>>>>>> de @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -87,43 +70,10 @@ cdef class Lexeme: else: return 0 - @staticmethod - cdef inline void 
set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: - if name < (sizeof(flags_t) * 8): - Lexeme.set_flag(lex, name, value) - elif name == ID: - lex.id = value - elif name == LOWER: - lex.lower = value - elif name == NORM: - lex.norm = value - elif name == SHAPE: - lex.shape = value - elif name == PREFIX: - lex.prefix = value - elif name == SUFFIX: - lex.suffix = value - elif name == CLUSTER: - lex.cluster = value - @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: -<<<<<<< HEAD return lexeme.flags & (1 << flag_id) - @staticmethod - cdef inline bint set_flag(LexemeC* lexeme, attr_id_t flag_id, bint value) nogil: - cdef flags_t one = 1 - if value: - lexeme.flags |= one << flag_id - else: - lexeme.flags &= ~(one << flag_id) -======= - if lexeme.flags & (1 << flag_id): - return True - else: - return False - @staticmethod cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: cdef flags_t one = 1 @@ -131,4 +81,3 @@ cdef class Lexeme: lex.flags |= one << flag_id else: lex.flags &= ~(one << flag_id) ->>>>>>> de From f6ec5bf1b0485a64e25a9b1828fadde5884b5f10 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 20:19:27 +0200 Subject: [PATCH 137/138] * Use empty tag map in vocab if none supplied --- spacy/vocab.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 596570a98..012909755 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -38,6 +38,8 @@ cdef class Vocab: '''A map container for a language's LexemeC structs. ''' def __init__(self, get_lex_attr=None, tag_map=None, vectors=None): + if tag_map is None: + tag_map = {} self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() From fc8f7b123d27aa4f4a00e9684f00855f5217bae3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 20:19:51 +0200 Subject: [PATCH 138/138] * Mark a matcher test as requiring the model --- tests/test_matcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_matcher.py b/tests/test_matcher.py index 1b748cb53..986d8a8bd 100644 --- a/tests/test_matcher.py +++ b/tests/test_matcher.py @@ -48,10 +48,10 @@ def test_match_multi(matcher, EN): (EN.vocab.strings['PRODUCT'], 5, 6)] +@pytest.mark.models def test_match_preserved(matcher, EN): doc = EN.tokenizer('I like java') EN.tagger(doc) - EN.entity(doc) assert len(doc.ents) == 0 doc = EN.tokenizer('I like java') matcher(doc)
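
For reference, a small usage sketch of the case-insensitive matching exercised in the updated tests. The pattern layout and the (label, start, end) result tuples follow the fixtures in tests/test_matcher.py; constructing English() here assumes the English model data is installed, which is exactly why the test above is now marked with @pytest.mark.models:

    from spacy.en import English
    from spacy.matcher import Matcher

    EN = English()                                   # needs installed data
    patterns = {
        'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
        'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
    }
    matcher = Matcher(EN.vocab, patterns)
    doc = EN.tokenizer(u'I like java and Google Now')
    print(matcher(doc))    # e.g. [(PRODUCT_id, 2, 3), (PRODUCT_id, 4, 6)]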