2 commits 48302defd3 ... 172fc00a92

Author  SHA1  Message  Commit date
  shibing624  172fc00a92  fixed https://github.com/shibing624/textgen/issues/60?notification_referrer_id=NT_kwDOAJxllrQxMjMzMDQ4NTk3ODoxMDI0OTYyMg  1 month ago
  shibing624  7933886000  update gradio.  2 months ago
2 files changed, 4 insertions(+), 339 deletions(-)
  1. requirements.txt (+2 -1)
  2. textgen/custom_models/models.py (+2 -338)

requirements.txt (+2 -1)

@@ -12,4 +12,5 @@ wandb>=0.10.32
 sacremoses
 Rouge
 cpm_kernels
-peft>=0.3.0
+peft>=0.3.0
+gradio

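The requirements change above adds gradio, matching the "update gradio." commit. Below is a minimal sketch only, assuming gradio is used to serve a simple text-in/text-out demo UI; the predict function and its behavior are placeholders for illustration and are not part of this commit:

import gradio as gr

def predict(prompt: str) -> str:
    # Placeholder only; a real demo would call a textgen model here.
    return prompt[::-1]

# gr.Interface wraps the function in a simple web form;
# launch() starts a local server (default http://127.0.0.1:7860).
demo = gr.Interface(fn=predict, inputs="text", outputs="text")

if __name__ == "__main__":
    demo.launch()
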
textgen/custom_models/models.py (+2 -338)

@@ -6,42 +6,22 @@
 import copy
 import torch
 from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import BCEWithLogitsLoss
 from transformers import (
     BertModel,
     BertPreTrainedModel,
-    DistilBertModel,
     ElectraForMaskedLM,
     ElectraForPreTraining,
     FlaubertModel,
     LongformerModel,
-    RobertaModel,
     XLMModel,
     XLMPreTrainedModel,
     XLNetModel,
     XLNetPreTrainedModel,
     T5ForConditionalGeneration,
 )
-from transformers.models.camembert.configuration_camembert import CamembertConfig
-from transformers.models.distilbert.configuration_distilbert import DistilBertConfig
-from transformers.models.roberta.configuration_roberta import RobertaConfig
-from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
-from transformers.models.albert.modeling_albert import AlbertConfig, AlbertModel, AlbertPreTrainedModel
-from transformers.models.distilbert.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST
-from transformers.models.electra.modeling_electra import (
-    ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST,
-    ElectraConfig,
-    ElectraModel,
-    ElectraPreTrainedModel,
-)
-from transformers.models.camembert.modeling_camembert import CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST
-from transformers.models.roberta.modeling_roberta import (
-    ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
-    RobertaClassificationHead,
-    RobertaForQuestionAnswering,
-)
+from transformers.models.albert.modeling_albert import AlbertModel, AlbertPreTrainedModel
 from transformers.modeling_utils import PreTrainedModel, SequenceSummary
-from transformers.models.xlm_roberta.modeling_xlm_roberta import XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
 from transformers.models.longformer.modeling_longformer import LongformerClassificationHead, LongformerPreTrainedModel
 
 
@@ -87,77 +67,6 @@ class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
 
-class RobertaForMultiLabelSequenceClassification(BertPreTrainedModel):
-    """
-    Roberta model adapted for multi-label sequence classification
-    """
-
-    config_class = RobertaConfig
-    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "roberta"
-
-    def __init__(self, config, pos_weight=None):
-        super(RobertaForMultiLabelSequenceClassification, self).__init__(config)
-        self.num_labels = config.num_labels
-        self.pos_weight = pos_weight
-
-        self.roberta = RobertaModel(config)
-        self.classifier = RobertaClassificationHead(config)
-
-    def forward(
-            self,
-            input_ids=None,
-            attention_mask=None,
-            token_type_ids=None,
-            position_ids=None,
-            head_mask=None,
-            inputs_embeds=None,
-            labels=None,
-    ):
-        outputs = self.roberta(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-        )
-        sequence_output = outputs[0]
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]
-        if labels is not None:
-            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
-            labels = labels.float()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
-            outputs = (loss,) + outputs
-
-        return outputs
-
-
-class BertweetForMultiLabelSequenceClassification(RobertaForMultiLabelSequenceClassification):
-    """
-    BERTweet model adapted for multi-label sequence classification.
-    BERTweet shares the Roberta architecture, so we can reuse the simpletransformers
-    RobertaForMultiLabelSequenceClassification implementation
-    """
-
-    base_model_prefix = "bertweet"
-
-
-class CamembertForMultiLabelSequenceClassification(RobertaForMultiLabelSequenceClassification):
-    """
-    Camembert model adapted for multi-label sequence classification.
-    Camembert shares the Roberta architecture, so we can reuse the simpletransformers
-    RobertaForMultiLabelSequenceClassification implementation, as it is done in
-    the transformers library
-    (https://github.com/huggingface/transformers/blob/master/src/transformers/modeling_camembert.py).
-    """
-
-    config_class = CamembertConfig
-    pretrained_model_archive_map = CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "camembert"
-
-
 class XLNetForMultiLabelSequenceClassification(XLNetPreTrainedModel):
     """
     XLNet model adapted for multi-label sequence classification
@@ -266,69 +175,6 @@ class XLMForMultiLabelSequenceClassification(XLMPreTrainedModel):
         return outputs
 
 
-class DistilBertPreTrainedModel(PreTrainedModel):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    config_class = DistilBertConfig
-    pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST
-    load_tf_weights = None
-    base_model_prefix = "distilbert"
-
-    def _init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, nn.Embedding):
-            if module.weight.requires_grad:
-                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-
-
-class DistilBertForMultiLabelSequenceClassification(DistilBertPreTrainedModel):
-    """
-    DistilBert model adapted for multi-label sequence classification
-    """
-
-    def __init__(self, config, pos_weight=None):
-        super(DistilBertForMultiLabelSequenceClassification, self).__init__(config)
-        self.num_labels = config.num_labels
-        self.pos_weight = pos_weight
-
-        self.distilbert = DistilBertModel(config)
-        self.pre_classifier = nn.Linear(config.dim, config.dim)
-        self.classifier = nn.Linear(config.dim, config.num_labels)
-        self.dropout = nn.Dropout(config.seq_classif_dropout)
-
-        self.init_weights()
-
-    def forward(
-            self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None,
-    ):
-        distilbert_output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask)
-        hidden_state = distilbert_output[0]  # (bs, seq_len, dim)
-        pooled_output = hidden_state[:, 0]  # (bs, dim)
-        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
-        pooled_output = nn.ReLU()(pooled_output)  # (bs, dim)
-        pooled_output = self.dropout(pooled_output)  # (bs, dim)
-        logits = self.classifier(pooled_output)  # (bs, dim)
-
-        outputs = (logits,) + distilbert_output[1:]
-        if labels is not None:
-            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
-            labels = labels.float()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
-            outputs = (loss,) + outputs
-
-        return outputs
-
-
 class AlbertForMultiLabelSequenceClassification(AlbertPreTrainedModel):
     """
     Albert model adapted for multi-label sequence classification
@@ -484,11 +330,6 @@ class LongformerForMultiLabelSequenceClassification(LongformerPreTrainedModel):
         return outputs
 
 
-class XLMRobertaForMultiLabelSequenceClassification(RobertaForMultiLabelSequenceClassification):
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
-
-
 class ElectraPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -563,183 +404,6 @@ class ElectraForLanguageModelingModel(PreTrainedModel):
         return g_loss, d_loss, g_scores, d_scores, d_labels
 
 
-class ElectraForSequenceClassification(ElectraPreTrainedModel):
-    r"""
-    Mostly the ssame as BertForSequenceClassification. A notable difference is that this class contains a pooler while
-    BertForSequenceClassification doesn't. This is because pooling happens internally in a BertModel but not in an
-    ElectraModel.
-        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
-            Labels for computing the sequence classification/regression loss.
-            Indices should be in ``[0, ..., config.num_labels - 1]``.
-            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
-            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
-            Classification (or regression if config.num_labels==1) loss.
-        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-    """  # noqa
-    config_class = ElectraConfig
-    pretrained_model_archive_map = ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "electra"
-
-    def __init__(self, config, weight=None):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.electra = ElectraModel(config)
-        self.pooler = ElectraPooler(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
-        self.weight = weight
-
-    def forward(
-            self,
-            input_ids=None,
-            attention_mask=None,
-            token_type_ids=None,
-            position_ids=None,
-            head_mask=None,
-            inputs_embeds=None,
-            labels=None,
-    ):
-
-        outputs = self.electra(input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds)
-        sequence_output = outputs[0]
-        pooled_output = self.pooler(sequence_output)
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss(weight=self.weight)
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), logits, (hidden_states), (attentions)
-
-
-class ElectraForMultiLabelSequenceClassification(ElectraPreTrainedModel):
-    """
-    ElectraForSequenceClassification model adapted for multi-label sequence classification
-    """
-
-    config_class = ElectraConfig
-    pretrained_model_archive_map = ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "electra"
-
-    def __init__(self, config, pos_weight=None):
-        super(ElectraForMultiLabelSequenceClassification, self).__init__(config)
-        self.num_labels = config.num_labels
-        self.pos_weight = pos_weight
-
-        self.electra = ElectraModel(config)
-        self.pooler = ElectraPooler(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
-
-    def forward(
-            self,
-            input_ids=None,
-            attention_mask=None,
-            token_type_ids=None,
-            position_ids=None,
-            head_mask=None,
-            inputs_embeds=None,
-            labels=None,
-    ):
-        outputs = self.electra(input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds)
-        sequence_output = outputs[0]
-        pooled_output = self.pooler(sequence_output)
-        pooled_output = self.dropout(pooled_output)
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]
-        if labels is not None:
-            loss_fct = BCEWithLogitsLoss(pos_weight=self.pos_weight)
-            labels = labels.float()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
-            outputs = (loss,) + outputs
-
-        return outputs
-
-
-class ElectraForQuestionAnswering(ElectraPreTrainedModel):
-    """
-    Identical to BertForQuestionAnswering other than using an ElectraModel
-    """
-
-    config_class = ElectraConfig
-    pretrained_model_archive_map = ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST
-    base_model_prefix = "electra"
-
-    def __init__(self, config, weight=None):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-
-        self.electra = ElectraModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-
-        self.init_weights()
-
-    def forward(
-            self,
-            input_ids=None,
-            attention_mask=None,
-            token_type_ids=None,
-            position_ids=None,
-            head_mask=None,
-            inputs_embeds=None,
-            start_positions=None,
-            end_positions=None,
-    ):
-
-        outputs = self.electra(input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds)
-        sequence_output = outputs[0]
-        logits = self.qa_outputs(sequence_output)
-
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1)
-        end_logits = end_logits.squeeze(-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions.clamp_(0, ignored_index)
-            end_positions.clamp_(0, ignored_index)
-
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-            outputs = (total_loss,) + outputs
-
-        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
-
-
-class XLMRobertaForQuestionAnswering(RobertaForQuestionAnswering):
-    config_class = XLMRobertaConfig
-    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST
-
-
 class CopyGenerator(nn.Module):
     def __init__(self, config):
         super().__init__()
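
For context on the heads that remain after this commit, here is a hedged usage sketch of BertForMultiLabelSequenceClassification. It assumes the retained class mirrors the interface of the removed variants shown above (a tuple output of (loss), logits, ... and BCEWithLogitsLoss over float multi-hot labels); the checkpoint name and label values are illustrative only and are not taken from the repository.

import torch
from transformers import BertTokenizer
from textgen.custom_models.models import BertForMultiLabelSequenceClassification

# Assumption: the retained head accepts num_labels via from_pretrained and
# returns (loss, logits, ...) when float multi-hot labels are supplied.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMultiLabelSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3
)

inputs = tokenizer("an example sentence", return_tensors="pt")
labels = torch.tensor([[1.0, 0.0, 1.0]])  # multi-hot targets, shape (batch_size, num_labels)

outputs = model(**inputs, labels=labels)
loss, logits = outputs[0], outputs[1]  # (loss), logits, (hidden_states), (attentions)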