The Transformer was first proposed in the classic 2017 NIPS paper "Attention Is All You Need". At that point it was still an Encoder-Decoder model for machine translation, notable for abandoning the then-dominant CNN and RNN architectures: instead of convolutions or recurrent units, it processes features using only self-attention plus feed-forward networks. The Encoder half later took on a life of its own, spawning a large family of pre-trained models with a Transformer Encoder backbone, BERT being the classic example. In this CS182 assignment, we implement the original Transformer architecture and use it for document summarization.
for size in feature_sizes:
    assert size % self.n_heads == 0, 'Shape of feature input must be divisible by n_heads'
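For context, this check would typically live in the projection module's constructor, before the per-head split ever happens. A minimal sketch of where it fits (the class name `MultiHeadProjection` and the constructor signature are assumptions, not the assignment's exact code):

import torch.nn as nn

class MultiHeadProjection(nn.Module):
    # Hypothetical constructor; feature_sizes holds the query/key/value feature dimensions
    def __init__(self, n_heads, feature_sizes):
        super().__init__()
        self.n_heads = n_heads
        for size in feature_sizes:
            assert size % self.n_heads == 0, 'Shape of feature input must be divisible by n_heads'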
def forward(self, inputs, mask=None):
    queries, keys, values = inputs
    # Split each of the projections into its heads, by adding a new dimension
    # You must implement _split_heads, and _combine_heads
    queries_split = self._split_heads(queries)
    keys_split = self._split_heads(keys)
    values_split = self._split_heads(values)
    # Apply the attention map
    attention_output_split, _ = self.attention_map(queries_split, keys_split, values_split, mask=mask)
    # Re-combine the heads together, and return the output.
    output = self._combine_heads(attention_output_split)
    return output
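The `attention_map` applied above is the paper's scaled dot-product attention, softmax(QK^T / sqrt(d_k))V, applied independently per head. A minimal sketch of what it computes (a hypothetical standalone function; the assignment provides its own implementation, and the mask convention here, 1 = attend, is an assumption):

import math
import torch as th

def scaled_dot_product_attention(queries, keys, values, mask=None):
    # queries/keys/values: [batch_size, n_heads, seq_len, head_dim]
    d_k = queries.shape[-1]
    scores = th.matmul(queries, keys.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        # Send masked positions to -inf so softmax assigns them zero weight
        scores = scores.masked_fill(mask == 0, float('-inf'))
    weights = th.softmax(scores, dim=-1)
    return th.matmul(weights, values), weights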
def _split_heads(self, tensor):
    assert len(tensor.shape) == 3
    batch_size, tensorlen = tensor.shape[0], tensor.shape[1]
    feature_size = tensor.shape[2]
    # Compute what the feature size per head is.
    new_feature_size = feature_size // self.n_heads
    # Reshape this projection tensor so that it has n_heads, each of new_feature_size
    tensor = tensor.reshape(batch_size, tensorlen, self.n_heads, new_feature_size)
    # Transpose the matrix so the outer-dimensions are the batch-size and the number of heads
    tensor = tensor.transpose(1, 2)
    return tensor
def _combine_heads(self, tensor):
    assert len(tensor.shape) == 4
    # Undo the transpose from _split_heads so the heads dimension sits next to the features
    tensor = tensor.transpose(1, 2)
    batch_size, tensorlen = tensor.shape[0], tensor.shape[1]
    feature_size = tensor.shape[3]
    # What is the new feature size, if we combine all the heads
    new_feature_size = feature_size * self.n_heads
    # Reshape the Tensor to remove the heads dimension and come back to a Rank-3 tensor
    tensor = tensor.reshape(batch_size, tensorlen, new_feature_size)
    return tensor
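Since `_combine_heads` is the exact inverse of `_split_heads`, a quick round-trip check makes a useful sanity test. A hypothetical snippet, assuming the projection module is instantiated as `proj` with `n_heads` dividing the feature size:

import torch as th

x = th.randn(2, 10, 64)            # [batch_size, seq_len, feature_size]
split = proj._split_heads(x)       # [2, n_heads, 10, 64 // n_heads]
restored = proj._combine_heads(split)
assert restored.shape == x.shape
assert th.allclose(restored, x)    # split followed by combine is the identity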
# create the position embedding as described in the paper
# use the `divisor` attribute instantiated in __init__
sin_embedding = th.sin(index / self.divisor)
cos_embedding = th.cos(index / self.divisor)
position_shape = (1, sequence_length, self.hidden_size)  # fill in the other two dimensions
position_embedding = th.stack((sin_embedding, cos_embedding), dim=3).view(position_shape)
pos_embed_deviced = position_embedding.to(get_device())
# add the embedding to the input
return inputs + pos_embed_deviced
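For this fragment to work, `self.divisor` must hold the per-dimension wavelengths 10000^(2i/d_model) from the paper, and `index` must be shaped so that `index / self.divisor` broadcasts to [1, sequence_length, hidden_size // 2] before the sin/cos pair is stacked and flattened. A plausible sketch of those pieces as a standalone helper (the attribute and variable names mirror the fragment, but this is an assumption about the surrounding `__init__`/`forward`, assuming an even `hidden_size`):

import torch as th

def sinusoidal_divisor_and_index(sequence_length, hidden_size):
    # divisor[i] = 10000 ** (2i / hidden_size): one wavelength per sin/cos pair
    pair_index = th.arange(0, hidden_size, 2, dtype=th.float32)
    divisor = 10000 ** (pair_index / hidden_size)
    # Position indices shaped [1, sequence_length, 1] so the division broadcasts
    index = th.arange(sequence_length, dtype=th.float32).reshape(1, -1, 1)
    return divisor, index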
class TransformerDecoderBlock(nn.Module):
    """A decoding block from the paper Attention Is All You Need
    (https://arxiv.org/pdf/1706.03762.pdf).

    :param inputs: two Tensors encoder_outputs, decoder_inputs
        encoder_outputs -> a Tensor with shape [batch_size, sequence_length, channels]
        decoder_inputs -> a Tensor with shape [batch_size, decoding_sequence_length, channels]
    :return: output: Tensor with same shape as decoder_inputs
    """
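Per the paper, the decoder block chains masked self-attention over the decoder sequence, encoder-decoder cross-attention, and a position-wise feed-forward network, each wrapped in a residual connection and layer normalization. A minimal sketch of that data flow, using PyTorch's built-in attention rather than the assignment's own modules (all attribute names here are assumptions, and the real block also threads padding masks through):

import torch.nn as nn

class TransformerDecoderBlockSketch(nn.Module):
    """Hypothetical skeleton showing the decoder block's data flow."""
    def __init__(self, hidden_size, n_heads):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(hidden_size, n_heads, batch_first=True)
        self.cross_attention = nn.MultiheadAttention(hidden_size, n_heads, batch_first=True)
        self.feed_forward = nn.Sequential(
            nn.Linear(hidden_size, 4 * hidden_size), nn.ReLU(),
            nn.Linear(4 * hidden_size, hidden_size))
        self.norm1 = nn.LayerNorm(hidden_size)
        self.norm2 = nn.LayerNorm(hidden_size)
        self.norm3 = nn.LayerNorm(hidden_size)

    def forward(self, inputs, causal_mask=None):
        encoder_outputs, decoder_inputs = inputs
        # Masked self-attention over the decoder sequence (residual + norm)
        x, _ = self.self_attention(decoder_inputs, decoder_inputs, decoder_inputs,
                                   attn_mask=causal_mask)
        x = self.norm1(decoder_inputs + x)
        # Cross-attention: queries from the decoder, keys/values from the encoder
        y, _ = self.cross_attention(x, encoder_outputs, encoder_outputs)
        y = self.norm2(x + y)
        # Position-wise feed-forward network (residual + norm)
        return self.norm3(y + self.feed_forward(y))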
def forward(self, teacher_out, student_out, penalize_prediction=False):
    embedding_loss = self.embedding_loss(teacher_out['embeddings'], student_out['embeddings'])
    # apply the loss from each attention and hidden layer based on the layer mapping
    attention_loss = 0
    hidden_loss = 0
    for st_i, te_i in enumerate(self.layer_mapping):
        attn_fn = self.__getattr__(f"attention_loss{st_i}")
        attention_loss += attn_fn(
            teacher_out['attentions'][te_i],
            student_out['attentions'][st_i]
        )
        hddn_fn = self.__getattr__(f"hidden_loss{st_i}")
        hidden_loss += hddn_fn(
            teacher_out['hidden_states'][te_i],
            student_out['hidden_states'][st_i]
        )
    # sum up the loss for each layer
    loss = embedding_loss + attention_loss + hidden_loss
    # apply the prediction penalty during task distillation
    if penalize_prediction:
        prediction_loss = self.prediction_loss(
            teacher_out['logits'],
            student_out['logits']
        )
        loss += prediction_loss
    return loss
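The `attention_loss{st_i}` and `hidden_loss{st_i}` attributes looked up via `__getattr__` imply that one loss module is registered per mapped layer in `__init__`. A hypothetical constructor showing that registration with MSE losses, TinyBERT-style (the names match the `forward` above, but the loss choices are assumptions; TinyBERT-style distillation often uses a soft cross-entropy over temperature-scaled logits for the prediction term instead):

import torch.nn as nn

class DistillationLossSketch(nn.Module):
    def __init__(self, layer_mapping):
        super().__init__()
        # e.g. layer_mapping = [2, 5, 8, 11]: student layer i distills teacher layer layer_mapping[i]
        self.layer_mapping = layer_mapping
        self.embedding_loss = nn.MSELoss()
        self.prediction_loss = nn.MSELoss()
        for st_i in range(len(layer_mapping)):
            # Register one loss module per mapped layer, matching the
            # f"attention_loss{st_i}" / f"hidden_loss{st_i}" lookups in forward
            self.add_module(f"attention_loss{st_i}", nn.MSELoss())
            self.add_module(f"hidden_loss{st_i}", nn.MSELoss())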