I'm trying to train a miniature BERT model from scratch on SST-2, a simple binary sentiment-analysis task whose inputs are maybe 5-20 words each. As you can see from my code, my approach is a little non-standard in a couple of ways:
- The model is quite small: 4 attention heads of 64 dimensions each (hidden size 256) and only 4 transformer layers.
- I'm not using a subword tokenizer; I tokenize at the character level, restricting the vocabulary to just the characters that appear in this dataset (so my vocabulary is about 50 tokens).
Nevertheless, I'm surprised that the model can't even seem to overfit the training set: after something like 10 epochs, I see no drop in the training loss at all. Should I just train longer, or am I doing something fundamentally wrong?
```python
import datasets
from tokenizers.normalizers import BertNormalizer
from transformers.models.bert import BertForSequenceClassification, BertConfig
import torch
from torch.nn.functional import cross_entropy
from torch.optim import Adam
from torch.utils.data import DataLoader
# Lowercase and strip accents; "æ" is not decomposed by strip_accents, so expand it by hand.
def normalize(sentence):
    normalized = normalizer.normalize_str(sentence)
    normalized = normalized.replace("æ", "ae")
    return normalized

sst2 = datasets.load_dataset("sst2")

normalizer = BertNormalizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=True,
    lowercase=True
)
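# Only the raw sentences are needed from each split here (the sst2 test split has no usable labels anyway).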
train_text = sst2["train"]["sentence"]
test_text = sst2["test"]["sentence"]
# Build the character vocabulary over both splits and find the longest normalized sentence.
characters = set()
max_sentence_length = 0
for sentence in train_text + test_text:
    normalized = normalize(sentence)
    characters.update(normalized)
    max_sentence_length = max(max_sentence_length, len(normalized))

character_to_id = {character: index for index, character in enumerate(characters)}
n_vocabulary = len(characters)
n_heads = 4
dimension = 64
n_layers = 4
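# Everything not set here keeps the BertConfig default, notably num_labels=2 (binary SST-2)
# and max_position_embeddings=512.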
config = BertConfig(
vocab_size=n_vocabulary,
num_hidden_layers=n_layers,
num_attention_heads=n_heads,
hidden_size=n_heads * dimension,
intermediate_size=256
)
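# Randomly initialized weights (no pretraining); the classification head sits on the pooled
# first-token representation.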
model = BertForSequenceClassification(config).to("cuda")
# Encode each training sentence as padded character ids plus an attention mask over the real characters.
n_train = len(train_text)
token_ids = torch.zeros((n_train, max_sentence_length), dtype=torch.long)
attention_mask = torch.zeros((n_train, max_sentence_length))
for i in range(n_train):
    normalized = normalize(train_text[i])
    for j, c in enumerate(normalized):
        if j >= max_sentence_length:
            # Shouldn't happen (the max length was computed over both splits), but grow the tensors just in case.
            token_ids = torch.cat((token_ids, torch.zeros((n_train, 1), dtype=token_ids.dtype)), dim=1)
            attention_mask = torch.cat((attention_mask, torch.zeros(n_train, 1)), dim=1)
            max_sentence_length += 1
        token_ids[i, j] = character_to_id[c]
    attention_mask[i, :len(normalized)] = 1
train_x = token_ids.to("cuda")
train_y = torch.tensor(sst2["train"]["label"]).to("cuda")
train_mask = attention_mask.to("cuda")
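# The tensors already live on the GPU, so the DataLoader just slices and stacks them into
# fixed-order batches (no shuffling).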
batch_size = 128
loader = DataLoader(
list(zip(train_x, train_y, train_mask)),
batch_size=batch_size
)
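# Adam with its default hyperparameters (lr=1e-3).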
optimizer = Adam(model.parameters())
# Train indefinitely; report the current batch loss and its ratio to the very first batch's loss.
initial_loss = None
while True:
    for batch_x, batch_y, batch_mask in loader:
        outputs = model(input_ids=batch_x, attention_mask=batch_mask)
        loss = cross_entropy(outputs.logits, batch_y)
        if initial_loss is None:
            initial_loss = float(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        print(f"{float(loss):.4} ({100 * float(loss) / initial_loss:.2}%) ", end="\r")
    print("epoch")
```