
I'm trying to train (from scratch) a miniature BERT model on SST2, a simple binary sentiment analysis task with inputs of maybe 5-20 words at a time. As you can see in my code, my approach is a little non-standard in a few ways:

  1. The model is quite small: 4 attention heads of 64 dimensions each, and only 4 transformer layers.
  2. I'm not using a tokenizer, i.e. I'm just using unicode code points as tokens, and restricting the vocabulary to just the characters in this dataset (so my vocabulary is about 50 tokens).

Nevertheless, I'm surprised that my model can't even seem to overfit on the training set. After something like 10 epochs, I see no drop in training loss at all. Should I just train longer? Am I doing something fundamentally wrong?

```
import datasets
from tokenizers.normalizers import BertNormalizer
from transformers.models.bert import BertForSequenceClassification, BertConfig

import torch
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader


def normalize(sentence):
    normalized = normalizer.normalize_str(sentence)
    normalized = normalized.replace("æ", "ae")
    return normalized


sst2 = datasets.load_dataset("sst2")

normalizer = BertNormalizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=True
)

# Build a character-level vocabulary from the train and test splits
train_text = sst2["train"]["sentence"]
test_text = sst2["test"]["sentence"]
characters = set()
max_sentence_length = 0
for sentence in train_text + test_text:
    normalized = normalize(sentence)
    characters.update(normalized)
    max_sentence_length = max(max_sentence_length, len(normalized))

character_to_id = {}
for index, character in enumerate(characters):
    character_to_id[character] = index

n_vocabulary = len(characters)
n_heads = 4
dimension = 64
n_layers = 4

config = BertConfig(
    vocab_size=n_vocabulary,
    num_hidden_layers=n_layers,
    num_attention_heads=n_heads,
    hidden_size=n_heads * dimension,
    intermediate_size=256
)
model = BertForSequenceClassification(config).to("cuda")

# Encode the training sentences as padded rows of character ids plus attention masks
n_train = len(train_text)
token_ids = torch.zeros((n_train, max_sentence_length), dtype=torch.int32)
attention_mask = torch.zeros((n_train, max_sentence_length))
for i in range(len(train_text)):
    normalized = normalize(train_text[i])
    for j, c in enumerate(normalized):
        # Grow the tensors if a sentence turns out longer than expected
        if j >= max_sentence_length:
            token_ids = torch.cat((token_ids, torch.zeros(n_train, 1, dtype=torch.int32)), dim=1)
            attention_mask = torch.cat((attention_mask, torch.zeros(n_train, 1)), dim=1)
            max_sentence_length += 1
        token_ids[i, j] = character_to_id[c]
        attention_mask[i, :j + 1] = 1

train_x = token_ids.to("cuda")
train_y = torch.tensor(sst2["train"]["label"]).to("cuda")
train_mask = attention_mask.to("cuda")

batch_size = 128
loader = DataLoader(
    list(zip(train_x, train_y, train_mask)),
    batch_size=batch_size
)

optimizer = Adam(model.parameters())
initial_loss = None
while True:
    for batch_x, batch_y, batch_mask in loader:
        outputs = model(input_ids=batch_x, attention_mask=batch_mask)
        loss = cross_entropy(outputs.logits, batch_y)
        if initial_loss is None:
            initial_loss = float(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"{float(loss):.4} ({100*float(loss)/initial_loss:.2}%)  ", end="\r")
    print("epoch")
```

Jack M
  • How did you decide 10 epochs and 4 Transformers? – Cloud Cho Feb 27 '24 at 17:07
  • Regarding 4 Transformers, is "num_hidden_layers" equal to 4? Looking at the documentation, it appears to be the number of layers in the Transformer encoder, not 4 separate Transformers (documentation: https://huggingface.co/docs/transformers/model_doc/bert) – Cloud Cho Feb 27 '24 at 17:19

1 Answer


One obvious issue with my script is that I never called model.train() (I'm not sure what effect this has with the BERT model). I also found that the batch size can be reduced a little, which speeds things up. However, the real problem seems to be that my BERT model is too big.
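For reference, the main thing model.train() / model.eval() toggles in a BERT classifier is its dropout modules (there is no batch norm, and layer norm is unaffected). A minimal sketch to check this, assuming a small config roughly like the one in the question (not part of the actual experiment script):

```
import torch
from transformers.models.bert import BertConfig, BertForSequenceClassification

config = BertConfig(
    vocab_size=50,           # roughly the question's character vocabulary
    num_hidden_layers=4,
    num_attention_heads=4,
    hidden_size=256,
    intermediate_size=256,
)
model = BertForSequenceClassification(config)

dropouts = [m for m in model.modules() if isinstance(m, torch.nn.Dropout)]

model.train()
print(all(m.training for m in dropouts))   # True: dropout is active

model.eval()
print(any(m.training for m in dropouts))   # False: dropout is disabled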

Using the updated code with the above changes, I ran an experiment comparing two versions of the model, a larger one (same parameters as above) and a smaller one (configs sketched after this list):

  • Large model: 4 heads, 64 dimensions per head, FFN dimension of 256 and 4 layers
  • Small model: 4 heads, 16 dimensions per head, FFN dimension of 16 and 4 layers
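Concretely, the only difference between the two runs is the BertConfig. A sketch of the two configurations, assuming a vocab_size of roughly 50 as in the question:

```
from transformers.models.bert import BertConfig

n_vocabulary = 50  # approximate size of the character vocabulary

large_config = BertConfig(
    vocab_size=n_vocabulary,
    num_hidden_layers=4,
    num_attention_heads=4,
    hidden_size=4 * 64,      # 4 heads x 64 dims per head
    intermediate_size=256,   # FFN dimension
)

small_config = BertConfig(
    vocab_size=n_vocabulary,
    num_hidden_layers=4,
    num_attention_heads=4,
    hidden_size=4 * 16,      # 4 heads x 16 dims per head
    intermediate_size=16,    # FFN dimension
)
```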

Both models were trained with a batch size of 64, and I see the following relative training losses (i.e. the mean training loss of each epoch as a percentage of the initial training loss):

| epoch | 1      | 2      | 3      | 4      | 5      | 6      |
|-------|--------|--------|--------|--------|--------|--------|
| large | 99.07% | 99.03% | 98.99% | 98.98% | 98.98% | 98.94% |
| small | 96.7%  | 94.8%  | 93.6%  | 92.4%  | 91.5%  | 90.7%  |

The large model is definitely converging, but very slowly (and its convergence rate would have seemed even slower with the larger batch size). By the way, the smaller model does generalize too, with about 58% test accuracy after 6 epochs.
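As a rough sanity check on "too big", the parameter counts of the two configurations can be compared directly. A sketch (not part of the experiment script), again with vocab_size=50 as an approximation of the question's character vocabulary, so exact totals may differ slightly:

```
from transformers.models.bert import BertConfig, BertForSequenceClassification

# (head_dimension, ffn_dimension) for the large and small models above
for name, head_dim, ffn_dim in [("large", 64, 256), ("small", 16, 16)]:
    config = BertConfig(
        vocab_size=50,
        num_hidden_layers=4,
        num_attention_heads=4,
        hidden_size=4 * head_dim,
        intermediate_size=ffn_dim,
    )
    model = BertForSequenceClassification(config)
    n_params = sum(p.numel() for p in model.parameters())
    print(f"{name}: {n_params:,} parameters")
```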

```
import datasets
from tokenizers.normalizers import BertNormalizer
from transformers.models.bert import BertForSequenceClassification, BertConfig

import torch
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader


def normalize(sentence):
    normalized = normalizer.normalize_str(sentence)
    normalized = normalized.replace("æ", "ae")
    return normalized


def preprocess(dataset):
    # Encode each sentence as a padded row of character ids plus an attention mask
    sentences = dataset["sentence"]
    n_data = len(sentences)
    max_sentence_length = 1
    token_ids = torch.zeros((n_data, max_sentence_length), dtype=torch.int32)
    attention_mask = torch.zeros((n_data, max_sentence_length))
    for i in range(len(sentences)):
        normalized = normalize(sentences[i])
        for j, c in enumerate(normalized):
            # Grow the tensors whenever a sentence is longer than any seen so far
            if j >= max_sentence_length:
                token_ids = torch.cat((token_ids, torch.zeros(n_data, 1, dtype=torch.int32)), dim=1)
                attention_mask = torch.cat((attention_mask, torch.zeros(n_data, 1)), dim=1)
                max_sentence_length += 1
            token_ids[i, j] = character_to_id[c]
            attention_mask[i, :j + 1] = 1

    x = token_ids.to("cuda")
    y = torch.tensor(dataset["label"]).to("cuda")
    mask = attention_mask.to("cuda")

    return x, y, mask


print("Preprocessing data")

sst2 = datasets.load_dataset("sst2")

normalizer = BertNormalizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=True
)

# Build the character vocabulary from the train and validation splits
train_text = sst2["train"]["sentence"]
test_text = sst2["validation"]["sentence"]
characters = set()
for sentence in train_text + test_text:
    normalized = normalize(sentence)
    characters.update(normalized)

character_to_id = {}
for index, character in enumerate(characters):
    character_to_id[character] = index

train_x, train_y, train_mask = preprocess(sst2["train"])
test_x, test_y, test_mask = preprocess(sst2["validation"])

n_vocabulary = len(characters)
n_heads = 4
head_dimension = 64
ffn_dimension = 256
n_layers = 4

print("Loading model")
print("  n attention heads:", n_heads)
print("  head dimension:", head_dimension)
print("  n layers:", n_layers)
print("  ffn dimension:", ffn_dimension)

config = BertConfig(
    vocab_size=n_vocabulary,
    num_hidden_layers=n_layers,
    num_attention_heads=n_heads,
    hidden_size=n_heads * head_dimension,
    intermediate_size=ffn_dimension
)
model = BertForSequenceClassification(config).to("cuda")
model.train()

batch_size = 64
print("  batch size:", batch_size)
loader = DataLoader(
    list(zip(train_x, train_y, train_mask)),
    batch_size=batch_size
)
print()

n_batches = len(loader)

optimizer = Adam(model.parameters())
initial_loss = None
epoch_number = 1
while True:
    print("epoch", epoch_number)
    batch_index = 0
    mean_loss = 0
    for batch_x, batch_y, batch_mask in loader:
        outputs = model(input_ids=batch_x, attention_mask=batch_mask)
        loss = cross_entropy(outputs.logits, batch_y)
        if initial_loss is None:
            initial_loss = float(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        mean_loss += float(loss) / n_batches

        batch_index += 1
        print(f"  {batch_index}/{n_batches}     ", end="\r")

    epoch_number += 1

    # Evaluate on the validation split (used here as the test set)
    with torch.no_grad():
        outputs = model(input_ids=test_x, attention_mask=test_mask)
    predictions = outputs.logits.argmax(dim=1)
    test_accuracy = (predictions == test_y).sum() / predictions.shape[0]
    print(f"  train loss: {mean_loss:.4}")
    print(f"  relative loss: {100*(mean_loss/initial_loss):.4}%")
    print(f"  test accuracy: {100*test_accuracy:.4}%")
```

Jack M