# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn


class ErnieForCSC(nn.Layer):
    r"""
    ErnieForCSC is a model designed for the Chinese Spelling Correction (CSC)
    task. It integrates phonetic features into the language model by
    leveraging the powerful pre-training and fine-tuning method.
    See more details on https://aclanthology.org/2021.findings-acl.198.pdf.

    Args:
        ernie (ErnieModel):
            An instance of `paddlenlp.transformers.ErnieModel`.
        pinyin_vocab_size (int):
            The vocab size of the pinyin vocab.
        pad_pinyin_id (int, optional):
            The pad token id of the pinyin vocab. Defaults to 0.
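
    Example:
        A minimal usage sketch (the checkpoint name and the pinyin vocab size
        below are illustrative placeholders, not fixed by this module)::

            from paddlenlp.transformers import ErnieModel

            ernie = ErnieModel.from_pretrained("ernie-1.0")
            model = ErnieForCSC(ernie, pinyin_vocab_size=1000, pad_pinyin_id=0)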
- """

    def __init__(self, ernie, pinyin_vocab_size, pad_pinyin_id=0):
        super(ErnieForCSC, self).__init__()
        self.ernie = ernie
        emb_size = self.ernie.config["hidden_size"]
        hidden_size = self.ernie.config["hidden_size"]
        vocab_size = self.ernie.config["vocab_size"]
        self.pad_token_id = self.ernie.config["pad_token_id"]
        self.pinyin_vocab_size = pinyin_vocab_size
        self.pad_pinyin_id = pad_pinyin_id
        # Pinyin tokens get their own embedding table; padding positions are
        # zeroed out via `padding_idx`.
        self.pinyin_embeddings = nn.Embedding(
            self.pinyin_vocab_size, emb_size, padding_idx=pad_pinyin_id)
        # Binary classifier over each token: [not erroneous, erroneous].
        self.detection_layer = nn.Linear(hidden_size, 2)
        # Projects encoder states onto the word vocabulary for correction.
        self.correction_layer = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.Softmax()

    def forward(self,
                input_ids,
                pinyin_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None):
- r"""
- Args:
- input_ids (Tensor):
- Indices of input sequence tokens in the vocabulary. They are
- numerical representations of tokens that build the input sequence.
- It's data type should be `int64` and has a shape of [batch_size, sequence_length].
- pinyin_ids (Tensor):
- Indices of pinyin tokens of input sequence in the pinyin vocabulary. They are
- numerical representations of tokens that build the pinyin input sequence.
- It's data type should be `int64` and has a shape of [batch_size, sequence_length].
- token_type_ids (Tensor, optional):
- Segment token indices to indicate first and second portions of the inputs.
- Indices can be either 0 or 1:
- - 0 corresponds to a **sentence A** token,
- - 1 corresponds to a **sentence B** token.
- It's data type should be `int64` and has a shape of [batch_size, sequence_length].
- Defaults to None, which means no segment embeddings is added to token embeddings.
- position_ids (Tensor, optional):
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
- config.max_position_embeddings - 1]``.
- Defaults to `None`. Shape as `(batch_sie, num_tokens)` and dtype as `int32` or `int64`.
- attention_mask (Tensor, optional):
- Mask to indicate whether to perform attention on each input token or not.
- The values should be either 0 or 1. The attention scores will be set
- to **-infinity** for any positions in the mask that are **0**, and will be
- **unchanged** for positions that are **1**.
- - **1** for tokens that are **not masked**,
- - **0** for tokens that are **masked**.
- It's data type should be `float32` and has a shape of [batch_size, sequence_length].
- Defaults to `None`.
- Returns:
- detection_error_probs (Tensor):
- A Tensor of the detection probablity of each tokens.
- Shape as `(batch_size, sequence_length, 2)` and dtype as `int`.
- correction_logits (Tensor):
- A Tensor of the correction logits of each tokens.
- Shape as `(batch_size, sequence_length, vocab_size)` and dtype as `int`.
- """
        if attention_mask is None:
            # Build an additive mask from the word pad token: pad positions get
            # a large negative bias so attention ignores them.
            attention_mask = paddle.unsqueeze(
                (input_ids == self.pad_token_id
                 ).astype(self.detection_layer.weight.dtype) * -1e9,
                axis=[1, 2])
        embedding_output = self.ernie.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids)
        pinyin_embedding_output = self.pinyin_embeddings(pinyin_ids)

        # The detection module detects whether each Chinese character has a
        # spelling error.
        detection_outputs = self.ernie.encoder(embedding_output, attention_mask)
        # detection_error_probs shape: [B, T, 2]. It gives the probability,
        # between 0 and 1, that each token in the sequence is erroneous.
        detection_error_probs = self.softmax(
            self.detection_layer(detection_outputs))

        # The correction module corrects each potentially wrong character to
        # the right one. Word and pinyin embeddings are mixed per token,
        # weighted by the detection probabilities: the more likely a token is
        # erroneous, the more its pinyin embedding contributes.
        word_pinyin_embedding_output = detection_error_probs[:, :, 0:1] * embedding_output \
            + detection_error_probs[:, :, 1:2] * pinyin_embedding_output
        correction_outputs = self.ernie.encoder(word_pinyin_embedding_output,
                                                attention_mask)
        # correction_logits shape: [B, T, V]. For each position it scores every
        # token in the vocabulary as the correction candidate.
        correction_logits = self.correction_layer(correction_outputs)
        return detection_error_probs, correction_logits
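

if __name__ == "__main__":
    # A minimal smoke-test sketch, not part of the original module. It assumes
    # paddlenlp is installed; the "ernie-1.0" checkpoint name, the pinyin
    # vocab size, and the random ids below are illustrative placeholders.
    from paddlenlp.transformers import ErnieModel

    ernie = ErnieModel.from_pretrained("ernie-1.0")
    model = ErnieForCSC(ernie, pinyin_vocab_size=1000, pad_pinyin_id=0)

    input_ids = paddle.randint(low=1, high=100, shape=[2, 16], dtype="int64")
    pinyin_ids = paddle.randint(low=1, high=1000, shape=[2, 16], dtype="int64")
    detection_probs, correction_logits = model(input_ids, pinyin_ids)
    print(detection_probs.shape)    # [2, 16, 2]
    print(correction_logits.shape)  # [2, 16, vocab_size]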