My small BERT can't even overfit on a sentiment analysis task

Question

I'm trying to train (from scratch) a miniature BERT model on SST2, a simple binary sentiment analysis task with inputs of maybe 5-20 words at a time. As you can see in my code, my approach is a little non-standard in a few ways:

The model is quite small, 4 attention heads of 64 dimensions each and only four transformer layers.
I'm not using a tokenizer, i.e. I'm just using unicode code points as tokens, and restricting the vocabulary to just the characters in this dataset (so my vocabulary is about 50 tokens).

Nevertheless, I'm surprised that my model can't even seem to overfit on the training set. After something like 10 epochs, I see no drop in training loss at all. Should I just train longer? Am I doing something fundamentally wrong?

import datasets
from tokenizers.normalizers import BertNormalizer
from transformers.models.bert import BertForSequenceClassification, BertConfig
import torch
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader
def normalize(sentence):
    normalized = normalizer.normalize_str(sentence)
    normalized = normalized.replace("æ", "ae")
    return normalized
sst2 = datasets.load_dataset("sst2")
normalizer = BertNormalizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=True
)
train_text = sst2["train"]["sentence"]
test_text  = sst2["test"]["sentence"]
characters = set()
max_sentence_length = 0
for sentence in [train_text, test_text]:
    normalized = normalize(sentence)
    characters.update(normalized)
    max_sentence_length = max(max_sentence_length, len(normalized))
character_to_id = {}
for index, character in enumerate(characters):
    character_to_id[character] = index
n_vocabulary = len(characters)
n_heads = 4
dimension = 64
n_layers = 4
config = BertConfig(
    vocab_size=n_vocabulary,
    num_hidden_layers=n_layers,
    num_attention_heads=n_heads,
    hidden_size=n_heads * dimension,
    intermediate_size=256
)
model = BertForSequenceClassification(config).to("cuda")
n_train = len(train_text)
token_ids = torch.empty((n_train, max_sentence_length), dtype=torch.int32)
attention_mask = torch.zeros((n_train, max_sentence_length))
for i in range(len(train_text)):
    normalized = normalize(train_text[i])
    for j, c in enumerate(normalized):
        if j >= max_sentence_length:
            token_ids = torch.cat((token_ids, torch.empty(n_train, 1)), dim=1)
            attention_mask = torch.cat((attention_mask, torch.zeros(n_train, 1)), dim=1)
            max_sentence_length += 1
        token_ids[i, j] = character_to_id[c]
    attention_mask[i, :j + 1] = 1
train_x = token_ids.to("cuda")
train_y = torch.tensor(sst2["train"]["label"]).to("cuda")
train_mask = attention_mask.to("cuda")
batch_size = 128
loader = DataLoader(
    list(zip(train_x, train_y, train_mask)),
    batch_size=batch_size
)
optimizer = Adam(model.parameters())
initial_loss = None
while True:
    for batch_x, batch_y, batch_mask in loader:
        outputs = model(input_ids=batch_x, attention_mask=batch_mask)
        loss = cross_entropy(outputs.logits, batch_y)
        if initial_loss is None:
            initial_loss = float(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        print(f"{float(loss):.4} ({100*float(loss)/initial_loss:.2}%)     ", end="\r")
    print("epoch")
```

Regarding 4 Transformers, is "num_hidden_layers" equaling 4? When I see the document, it looks like the number of layer in the Transformer (encoder) not 4 Transformers...(document: https://huggingface.co/docs/transformers/model_doc/bert) — Cloud Cho, Feb 27 '24 at 17:19

score 1 · Accepted Answer · answered Feb 27 '24 at 18:25

One obvious issue with my script is that I never called model.train() (I'm not sure what effect this has with the BERT model). I also found that the batch size can be reduced a little, which speeds things up. However, the real problem seems to be that my BERT model is too big.

Using updated code with the above changes made, I ran an experiment comparing two versions of the model, a larger one (same parameters as above) and a smaller one:

Large model: 4 heads, 64 dimensions per head, FFN dimension of 256 and 4 layers
Small model: 4 heads, 16 dimensions per head, FFN dimension of 16 and 4 layers

Both models are trained with a batch size of 64, and I see the following relative training loss curves (i.e. proportion of mean epoch training loss to initial training loss):

epoch | 1       2       3       4       5       6     
------------------------------------------------------
large | 99.07%  99.03%  98.99%  98.98%  98.98%  98.94%
small | 96.7%   94.8%   93.6%   92.4%   91.5%   90.7%

The large model is definitely converging, but very slowly (and its convergence rate would have seemed even slower with the larger batch size). By the way, the smaller model does generalize too, with about 58% test accuracy after 6 epochs.

import datasets
from tokenizers.normalizers import BertNormalizer
from transformers.models.bert import BertForSequenceClassification, BertConfig
import torch
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader
def normalize(sentence):
    normalized = normalizer.normalize_str(sentence)
    normalized = normalized.replace("æ", "ae")
    return normalized
def preprocess(dataset):
    sentences = dataset["sentence"]
    n_data = len(sentences)
    max_sentence_length = 1
    token_ids = torch.empty((n_data, max_sentence_length), dtype=torch.int32)
    attention_mask = torch.zeros((n_data, max_sentence_length))
    for i in range(len(sentences)):
        normalized = normalize(sentences[i])
        for j, c in enumerate(normalized):
            if j >= max_sentence_length:
                token_ids = torch.cat((token_ids, torch.zeros(n_data, 1, dtype=torch.int32)), dim=1)
                attention_mask = torch.cat((attention_mask, torch.zeros(n_data, 1)), dim=1)
                max_sentence_length += 1
            token_ids[i, j] = character_to_id[c]
        attention_mask[i, :j + 1] = 1
x = token_ids.to(&quot;cuda&quot;)
y = torch.tensor(dataset[&quot;label&quot;]).to(&quot;cuda&quot;)
mask = attention_mask.to(&quot;cuda&quot;)

return x, y, mask


print("Preprocessing data")
sst2 = datasets.load_dataset("sst2")
normalizer = BertNormalizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=True
)
train_text = sst2["train"]["sentence"]
test_text  = sst2["validation"]["sentence"]
characters = set()
max_sentence_length = 0
for sentence in [train_text, test_text]:
    normalized = normalize(sentence)
    characters.update(normalized)
character_to_id = {}
for index, character in enumerate(characters):
    character_to_id[character] = index
train_x, train_y, train_mask = preprocess(sst2["train"])
test_x, test_y, test_mask = preprocess(sst2["validation"])
n_vocabulary = len(characters)
n_heads = 4
head_dimension = 64
ffn_dimension = 256
n_layers = 4
print("Loading model")
print("  n attention heads:", n_heads)
print("  head dimension:", head_dimension)
print("  n layers:", n_layers)
print("  ffn dimension:", ffn_dimension)
config = BertConfig(
    vocab_size=n_vocabulary,
    num_hidden_layers=n_layers,
    num_attention_heads=n_heads,
    hidden_size=n_heads * head_dimension,
    intermediate_size=ffn_dimension
)
model = BertForSequenceClassification(config).to("cuda")
model.train()
batch_size = 64
print("  batch size:", batch_size)
loader = DataLoader(
    list(zip(train_x, train_y, train_mask)),
    batch_size=batch_size
)
print()
n_batches = len(loader)
optimizer = Adam(model.parameters())
initial_loss = None
epoch_number = 1
while True:
    print("epoch", epoch_number)
    batch_index = 0
    mean_loss = 0
    for batch_x, batch_y, batch_mask in loader:
        outputs = model(input_ids=batch_x, attention_mask=batch_mask)
        loss = cross_entropy(outputs.logits, batch_y)
        if initial_loss is None:
            initial_loss = float(loss)
        loss.backward()
        optimizer.step()

        optimizer.zero_grad()
    mean_loss += float(loss) / n_batches

    batch_index += 1
    print(f&quot;  {batch_index}/{n_batches}     &quot;, end=&quot;\r&quot;)

epoch_number += 1

with torch.no_grad():
    outputs = model(input_ids=test_x, attention_mask=test_mask)
predictions = outputs.logits.argmax(dim=1)
test_accuracy = (predictions == test_y).sum() / predictions.shape[0]
print(f&quot;  train loss: {mean_loss:.4}&quot;)
print(f&quot;  relative loss: {100*(mean_loss/initial_loss):.4}%&quot;)
print(f&quot;  test accuracy: {100*test_accuracy:.4}%&quot;)

# print(&quot; &quot;, train_loss)
# print(&quot; &quot;, test_accuracy)

```

My small BERT can't even overfit on a sentiment analysis task

1 Answers1