The core functionality of a tokenizer can be summed up as tokenize and encode: tokenize splits text into tokens, while encode maps tokens (whether already split or not) to numerical ids according to the vocabulary. The tokenizer's tokenize and encode methods implement exactly these two operations; the __call__ method is essentially encode, but returns a BatchEncoding object that carries additional control information.
Since "token" can refer either to the abstract smallest unit of text processing (for a tokenizer, usually a word or subword) or to that unit in its concrete, character-encoded string form, this article may use the word loosely in both senses; please keep this conceptual distinction in mind.
# load a tokenizer
from transformers import BertTokenizer

global_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
hamlet = "To be or not to be, that is the question."
# tokenize, encode and decode
result = global_tokenizer.tokenize(hamlet)
print(result)
['to', 'be', 'or', 'not', 'to', 'be', ',', 'that', 'is', 'the', 'question', '.']
result = global_tokenizer.encode(result)
print(result)
[101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003,
1996, 3160, 1012, 102]
result = global_tokenizer.decode(result)
print(repr(result))
'[CLS] to be or not to be, that is the question. [SEP]'
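As the decoded string shows, encode adds the special tokens [CLS] (id 101) and [SEP] (id 102) by default. A minimal check, using the same tokenizer as above: passing the standard add_special_tokens=False parameter skips them.

# add_special_tokens=False drops the [CLS]/[SEP] ids 101 and 102
result = global_tokenizer.encode(hamlet, add_special_tokens=False)
print(result)
[2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003, 1996, 3160, 1012]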
# subword example
hysteria = "a psychoneurosis marked by emotional excitability and disturbances " \
"of the psychogenic, sensory, vasomotor, and visceral functions"
result = global_tokenizer.tokenize(hysteria)
print(result)
['a', 'psycho', '##ne', '##uro', '##sis', 'marked', 'by', 'emotional',
'ex', '##cit', '##ability', 'and', 'disturbances', 'of', 'the', 'psycho',
'##genic', ',', 'sensory', ',', 'va', '##som', '##oto', '##r', ',', 'and',
'vis', '##cera', '##l', 'functions']
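The "##" prefix marks a WordPiece continuation piece, i.e. a subword that attaches to the preceding token. As a rough sketch (assuming the standard convert_tokens_to_string method), the pieces can be merged back, although spacing around punctuation is not guaranteed to match the original text.

# merge subword pieces back into whitespace-joined words;
# prints roughly 'a psychoneurosis marked by emotional excitability and ...'
print(global_tokenizer.convert_tokens_to_string(result))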
# encoding raw text directly (without tokenizing first) is also supported
result = global_tokenizer.encode(hamlet)
print(result)
[101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003,
1996, 3160, 1012, 102]
result = global_tokenizer(hamlet)
print(result)
{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003,
1996, 3160, 1012, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
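The token_type_ids field matters mainly for sentence-pair input: __call__ also accepts a second sentence (text_pair), and the ids switch from 0 to 1 for the second segment. A small sketch, reusing hamlet as both sentences purely for illustration:

# sentence-pair input is packed as [CLS] A [SEP] B [SEP]; segment B is marked with 1s
result = global_tokenizer(hamlet, hamlet)
print(result['token_type_ids'])
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]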
# A tokenized list passed directly to __call__ is not encoded as one sentence:
# every token is treated as an individual sentence.
result = global_tokenizer.tokenize(hamlet)
result = global_tokenizer(result)
print(result)
{'input_ids': [[101, 2000, 102], [101, 2022, 102], [101, 2030, 102], ...
result = global_tokenizer.tokenize(hamlet)
# use of is_split_into_words
result = global_tokenizer(result, is_split_into_words=True)
print(result)
{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003,
1996, 3160, 1012, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
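Note that is_split_into_words only tells the tokenizer that whitespace splitting has already been done; each word is still run through WordPiece, so a rare word may be split further. A minimal sketch using the subword example from above:

# pre-split words are still broken into subwords by WordPiece
result = global_tokenizer(["psychoneurosis"], is_split_into_words=True)
print(global_tokenizer.convert_ids_to_tokens(result['input_ids']))
['[CLS]', 'psycho', '##ne', '##uro', '##sis', '[SEP]']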
# use of padding, truncation and max_length
result = global_tokenizer(hamlet, padding='max_length', max_length=20)
print(result)
{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003,
1996, 3160, 1012, 102, 0, 0, 0, 0, 0, 0],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}
result = global_tokenizer(hamlet, truncation=True, max_length=10)
print(result)
{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
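For batched input, padding=True (pad to the longest sequence in the batch) is usually more convenient than a fixed max_length. A short sketch, with a shortened second sentence added just for illustration:

# pad a batch to its longest sequence; attention_mask marks real vs. padded positions
result = global_tokenizer([hamlet, "To be or not to be"], padding=True)
print(result['attention_mask'])
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]]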
# use of return_tensors
result = global_tokenizer(hamlet, return_tensors="pt")
print(result)
{'input_ids': tensor([[ 101, 2000, 2022, 2030, 2025, 2000, 2022, 1010,
2008, 2003, 1996, 3160, 1012, 102]]),
'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
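With return_tensors="pt" the values become torch tensors with a batch dimension added, so the BatchEncoding can be unpacked straight into a model call (e.g. model(**result), where model is a hypothetical PyTorch model not defined in this section).

# the tensors carry a batch dimension: (batch_size, sequence_length)
print(result['input_ids'].shape)
torch.Size([1, 14])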
# additional flags to control returned value
result = global_tokenizer(hamlet,
                          return_token_type_ids=False,
                          return_attention_mask=False)
print(result)
{'input_ids': [101, 2000, 2022, 2030, 2025, 2000, 2022, 1010, 2008, 2003,
1996, 3160, 1012, 102]}
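Other return_* flags work the same way; for example, the standard return_special_tokens_mask=True parameter adds a mask marking which positions are special tokens:

# special_tokens_mask: 1 marks added special tokens ([CLS]/[SEP]), 0 marks regular tokens
result = global_tokenizer(hamlet, return_special_tokens_mask=True)
print(result['special_tokens_mask'])
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]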