mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 13:11:03 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			113 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			113 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| 
 | |
| from ..util import get_doc
 | |
| 
 | |
| 
 | |
@pytest.fixture
def text():
    """Return the multi-sentence English passage shared by the tests below.

    The tests in this module tokenize this string and pair the resulting
    tokens with the values from the ``heads`` fixture.
    """
    passage = """
It was a bright cold day in April, and the clocks were striking thirteen.
Winston Smith, his chin nuzzled into his breast in an effort to escape the
vile wind, slipped quickly through the glass doors of Victory Mansions,
though not quickly enough to prevent a swirl of gritty dust from entering
along with him.

The hallway smelt of boiled cabbage and old rag mats. At one end of it a
coloured poster, too large for indoor display, had been tacked to the wall.
It depicted simply an enormous face, more than a metre wide: the face of a
man of about forty-five, with a heavy black moustache and ruggedly handsome
features. Winston made for the stairs. It was no use trying the lift. Even at
the best of times it was seldom working, and at present the electric current
was cut off during daylight hours. It was part of the economy drive in
preparation for Hate Week. The flat was seven flights up, and Winston, who
was thirty-nine and had a varicose ulcer above his right ankle, went slowly,
resting several times on the way. On each landing, opposite the lift-shaft,
the poster with the enormous face gazed from the wall. It was one of those
pictures which are so contrived that the eyes follow you about when you move.
BIG BROTHER IS WATCHING YOU, the caption beneath it ran.
"""
    return passage
 | |
| 
 | |
| 
 | |
@pytest.fixture
def heads():
    """Return the integer head values handed to ``get_doc`` as ``heads=``.

    NOTE(review): the values look like per-token offsets to each token's
    syntactic head (0 marking a root) for the tokenized ``text`` fixture;
    confirm against ``get_doc`` before relying on that reading.
    """
    # fmt: off
    head_values = [
        1, 1, 0, 3, 2, 1, -4, -1, -1, -7, -8, 1, 2, 1, -12, -1, -2,
        -1, 1, 4, 3, 1, 1, 0, -1, 1, -2, -4, 1, -2, 1, -2, 3, -1, 1,
        -4, -13, -14, -1, -2, 2, 1, -3, -1, 1, -2, -9, -1, -11, 1, 1, -14,
        1, -2, 1, -2, -1, 1, -2, -6, -1, -1, -2, -1, -1, -42, -1, 1, 1,
        0, -1, 1, -2, -1, 2, 1, -4, -8, 18, 1, -2, -1, -1, 3, -1, 1, 10,
        9, 1, 7, -1, 1, -2, 3, 2, 1, 0, -1, 1, -2, -4, -1, 1, 0, -1,
        2, 1, -4, -1, 2, 1, 1, 1, -6, -11, 1, 20, -1, 2, -1, -3, -1,
        3, 2, 1, -4, -10, -11, 3, 2, 1, -4, -1, 1, -3, -1, 0, -1, 1, 0,
        -1, 1, -2, -4, 1, 0, 1, -2, -1, 1, -2, -6, 1, 9, -1, 1, 6, -1,
        -1, 3, 2, 1, 0, -1, -2, 7, -1, 2, 1, 3, -1, 1, -10, -1, -2, 1,
        -2, -5, 1, 0, -1, -1, 1, -2, -5, -1, -1, -2, -1, 1, -2, -12, 1,
        1, 0, 1, -2, -1, -4, -5, 18, -1, 2, -1, -4, 2, 1, -3, -4, -5, 2,
        1, -3, -1, 2, 1, -3, -17, -24, -1, -2, -1, -4, 1, -2, -3, 1, -2,
        -10, 17, 1, -2, 14, 13, 3, 2, 1, -4, 8, -1, 1, 5, -1, 2, 1, -3,
        0, -1, 1, -2, -4, 1, 0, -1, -1, 2, -1, -3, 1, -2, 1, -2, 3, 1,
        1, -4, -1, -2, 2, 1, -3, -19, -1, 1, 1, 0, 0, 6, 5, 1, 3, -1,
        -1, 0, -1, -1,
    ]
    # fmt: on
    return head_values
 | |
| 
 | |
| 
 | |
def test_parser_parse_navigate_consistency(en_tokenizer, text, heads):
    """Check that every left/right child points back at the token listing it.

    Builds a doc from the tokenized ``text`` and the ``heads`` fixture, then
    asserts ``child.head == parent`` for each child yielded by a token's
    ``lefts`` and ``rights``.
    """
    tokenized = en_tokenizer(text)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words=words, heads=heads)
    for parent in doc:
        # Left children are checked before right children.
        for side in (parent.lefts, parent.rights):
            for child in side:
                assert child.head == parent
 | |
| 
 | |
| 
 | |
def test_parser_parse_navigate_child_consistency(en_tokenizer, text, heads):
    """Cross-check child listings against head links and child counts.

    For each token the indices yielded by ``lefts`` and ``rights`` are
    collected into sets keyed by the token's index. The test then asserts:

    * ``n_lefts`` / ``n_rights`` equal the sizes of those sets;
    * every token whose head has a different index appears in exactly the
      matching side's set of its head (left if it precedes the head, right
      if it follows);
    * after removing each such token, no set has entries left over, i.e. no
      token is listed as a child without having that head.
    """
    tokenized = en_tokenizer(text)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words=words, heads=heads)

    left_children = {}
    right_children = {}
    for parent in doc:
        # Each token index must be seen only once while building the maps.
        assert parent.i not in left_children
        left_children[parent.i] = {kid.i for kid in parent.lefts}
        assert parent.i not in right_children
        right_children[parent.i] = {kid.i for kid in parent.rights}

    for parent in doc:
        assert parent.n_rights == len(right_children[parent.i])
        assert parent.n_lefts == len(left_children[parent.i])

    for token in doc:
        parent_i = token.head.i
        # Tokens whose head has their own index are skipped by both branches.
        if token.i < parent_i:
            assert token.i in left_children[parent_i]
            assert token.i not in right_children[parent_i]
            left_children[parent_i].remove(token.i)
        elif token.i > parent_i:
            assert token.i in right_children[parent_i]
            assert token.i not in left_children[parent_i]
            right_children[parent_i].remove(token.i)

    # Anything still present was listed as a child but never claimed above.
    for unclaimed in left_children.values():
        assert not unclaimed
    for unclaimed in right_children.values():
        assert not unclaimed
 | |
| 
 | |
| 
 | |
def test_parser_parse_navigate_edges(en_tokenizer, text, heads):
    """Check ``left_edge``/``right_edge`` against the ends of ``subtree``.

    For every token, the first element of ``list(token.subtree)`` must equal
    ``token.left_edge`` and the last must equal ``token.right_edge``. A
    tab-joined string of the texts involved is attached to each assertion as
    its failure message.
    """
    tokenized = en_tokenizer(text)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words=words, heads=heads)
    for token in doc:
        span = list(token.subtree)
        first, last = span[0], span[-1]

        left_msg = "\t".join((token.text, token.left_edge.text, first.text))
        assert token.left_edge == first, left_msg

        right_parts = (
            token.text,
            token.right_edge.text,
            last.text,
            token.right_edge.head.text,
        )
        right_msg = "\t".join(right_parts)
        assert token.right_edge == last, right_msg
 |