I'm not really sure if this is the sort of question to ask on here, since it is less of a general question about AI and more about the coding of it, however I thought it wouldn't fit on stack overflow.
I have been programming a multilayer perceptron in c++, and it seems to be working with a sigmoid function, however when I change the activation function to ReLU it does not converge and stays at an average cost of 1 per training example. this is because all of the network's output neurons output a 0.
With the sigmoid function it converges rather nicely, I did a bit of testing and after about 1000 generations it got to an average cost of 0.1 on the first 1000 items in the MNIST dataset.
I will show you the code I changes first for the activation functions, and then i will put the whole block of code in.
Any help would be greatly appreciated!
Sigmoid:
inline float activation(float num)
{
    return 1 / (1 + std::exp(-num));
}
inline float activation_derivative(float num)
{
    return activation(num) * (1 - activation(num));
}
ReLU:
inline float activation(float num)
{
    return std::max(num, 0.0f);
}
inline float activation_derivative(float num)
{
    return num > 0 ? 1.0f : 0.0f;
}
And here's the whole block of code (I collapsed the region of code for benchmarking and the region for creating the dataset):
#include <iostream>
#include <fstream>
#include <vector>
#include <random>
#include <chrono>
#include <cmath>
#include <string>
#include <algorithm>
#pragma region benchmarking
#pragma endregion
class Network
{
public:
    float cost = 0.0f;
    std::vector<std::vector<std::vector<float>>> weights;
    std::vector<std::vector<std::vector<float>>> deriv_weights;
    std::vector<std::vector<float>> biases;
    std::vector<std::vector<float>> deriv_biases;
    std::vector<std::vector<float>> activations;
    std::vector<std::vector<float>> deriv_activations;
    void clear_deriv_activations()
    {
        for (unsigned int i = 0; i < deriv_activations.size(); ++i)
        {
            std::fill(deriv_activations[i].begin(), deriv_activations[i].end(), 0.0f);
        }
    }
    int get_memory_usage()
    {
        int memory = 4;
        memory += get_vector_memory_usage(weights);
        memory += get_vector_memory_usage(deriv_weights);
        memory += get_vector_memory_usage(biases);
        memory += get_vector_memory_usage(deriv_biases);
        memory += get_vector_memory_usage(activations);
        memory += get_vector_memory_usage(deriv_activations);
        return memory;
    }
};
struct DataSet
{
    std::vector<std::vector<float>> training_inputs;
    std::vector<std::vector<float>> training_answers;
    std::vector<std::vector<float>> testing_inputs;
    std::vector<std::vector<float>> testing_answers;
};
Network create_network(std::vector<int> layers)
{
    Network network;
    int layer_count = layers.size() - 1;
    network.weights.reserve(layer_count);
    network.deriv_weights.reserve(layer_count);
    network.biases.reserve(layer_count);
    network.deriv_biases.reserve(layer_count);
    network.activations.reserve(layer_count);
    network.deriv_activations.reserve(layer_count);
    int nodes_in_prev_layer = layers[0];
    for (unsigned int i = 0; i < layers.size() - 1; ++i)
    {
        int nodes_in_layer = layers[i + 1];
        network.weights.emplace_back();
        network.weights[i].reserve(nodes_in_layer);
        network.deriv_weights.emplace_back();
        network.deriv_weights[i].reserve(nodes_in_layer);
        network.biases.emplace_back();
        network.biases[i].reserve(nodes_in_layer);
        network.deriv_biases.emplace_back(nodes_in_layer, 0.0f);
        network.activations.emplace_back(nodes_in_layer, 0.0f);
        network.deriv_activations.emplace_back(nodes_in_layer, 0.0f);
        for (int j = 0; j < nodes_in_layer; ++j)
        {
            network.weights[i].emplace_back();
            network.weights[i][j].reserve(nodes_in_prev_layer);
            network.deriv_weights[i].emplace_back(nodes_in_prev_layer, 0.0f);
            for (int k = 0; k < nodes_in_prev_layer; ++k)
            {
                float input_weight = (2 * (float(std::rand()) / RAND_MAX)) - 1; 
                network.weights[i][j].push_back(input_weight);
            }
            float input_bias = (2 * (float(std::rand()) / RAND_MAX)) - 1;
            network.biases[i].push_back(input_bias);
        }
        nodes_in_prev_layer = nodes_in_layer;
    }
    return network;
}
void judge_network(Network &network, const std::vector<float>& correct_answers)
{
    int final_layer_index = network.activations.size() - 1;
    for (unsigned int i = 0; i < network.activations[final_layer_index].size(); ++i)
    {
        float val_sq = (network.activations[final_layer_index][i] - correct_answers[i]);
        network.cost += val_sq * val_sq;
    }
}
inline float activation(float num)
{
    return std::max(num, 0.0f);
}
void forward_propogate(Network& network, const std::vector<float>& input)
{
    const std::vector<float>* last_layer_activations = &input;
    int last_layer_node_count = input.size();
    for (unsigned int i = 0; i < network.weights.size(); ++i)
    {
        for (unsigned int j = 0; j < network.weights[i].size(); ++j)
        {
            float total = network.biases[i][j];
            for (int k = 0; k < last_layer_node_count; ++k)
            {
                total +=  (*last_layer_activations)[k] * network.weights[i][j][k];
            }
            network.activations[i][j] = activation(total);
        }
        last_layer_activations = &network.activations[i];
        last_layer_node_count = network.weights[i].size();
    }
}
void final_layer_deriv_activations(Network& network, const std::vector<float>& correct_answers)
{
    int final_layer_index = network.activations.size() - 1;
    int final_layer_node_count = network.activations[final_layer_index].size();
    for (int i = 0; i < final_layer_node_count; ++i)
    {
        float deriv = network.activations[final_layer_index][i] - correct_answers[i];
        network.deriv_activations[final_layer_index][i] = deriv * 2;
    }
}
inline float activation_derivative(float num)
{
    return num > 0 ? 1.0f : 0.0f;
}
void back_propogate_layer(Network& network, int layer)
{
    int nodes_in_layer = network.activations[layer].size();
    int nodes_in_prev_layer = network.activations[layer - 1].size();
    for (int i = 0; i < nodes_in_layer; ++i)
    {
        float total = network.biases[layer][i];
        for (int j = 0; j < nodes_in_prev_layer; ++j)
        {
            total += network.weights[layer][i][j] * network.activations[layer - 1][j];
        }
        float dzda = activation_derivative(total);
        float dzdc = dzda * network.deriv_activations[layer][i];
        for (int j = 0; j < nodes_in_prev_layer; ++j)
        {
            network.deriv_weights[layer][i][j] += network.activations[layer - 1][j] * dzdc;
            network.deriv_activations[layer - 1][j] += network.weights[layer][i][j] * dzdc;
        }
        network.deriv_biases[layer][i] += dzdc;
    }
}
void back_propogate_first_layer(Network& network, std::vector<float> inputs)
{
    int nodes_in_layer = network.activations[0].size();
    int input_count = inputs.size();
    for (int i = 0; i < nodes_in_layer; ++i)
    {
        float total = network.biases[0][i];
        for (int j = 0; j < input_count; ++j)
        {
            total += network.weights[0][i][j] * inputs[j];
        }
        float dzda = activation_derivative(total);
        float dzdc = dzda * network.deriv_activations[0][i];
        for (int j = 0; j < input_count; ++j)
        {
            network.deriv_weights[0][i][j] += inputs[j] * dzdc;
        }
        network.deriv_biases[0][i] += dzdc;
    }
}
void back_propogate(Network& network, const std::vector<float>& inputs, const std::vector<float>& correct_answers)
{
    network.clear_deriv_activations();
    final_layer_deriv_activations(network, correct_answers);
    for (int i = network.activations.size() - 1; i > 0; --i)
    {
        back_propogate_layer(network, i);
    }
    back_propogate_first_layer(network, inputs);
}
void apply_derivatives(Network& network, int training_example_count)
{
    for (unsigned int i = 0; i < network.weights.size(); ++i)
    {
        for (unsigned int j = 0; j < network.weights[i].size(); ++j)
        {
            for (unsigned int k = 0; k < network.weights[i][j].size(); ++k)
            {
                network.weights[i][j][k] -= network.deriv_weights[i][j][k] / training_example_count;
                network.deriv_weights[i][j][k] = 0;
            }
            network.biases[i][j] -= network.deriv_biases[i][j] / training_example_count;
            network.deriv_biases[i][j] = 0;
            network.deriv_activations[i][j] = 0;
        }
    }
}
void training_iteration(Network& network, const DataSet& data)
{
    int training_example_count = data.training_inputs.size();
    for (int i = 0; i < training_example_count; ++i)
    {
        forward_propogate(network, data.training_inputs[i]);
        judge_network(network, data.training_answers[i]);
        back_propogate(network, data.training_inputs[i], data.training_answers[i]);
    }
    apply_derivatives(network, training_example_count);
}
void train_network(Network& network, const DataSet& dataset, int training_iterations)
{
    for (int i = 0; i < training_iterations; ++i)
    {
        training_iteration(network, dataset);
        std::cout << "Generation " << i << ": " << network.cost << std::endl;
        network.cost = 0.0f;
    }
}
#pragma region dataset creation
#pragma endregion
int main() 
{
    Timer timer;
    DataSet dataset = create_dataset_from_file("data.txt");
    Network network = create_network({784, 128, 10});
    train_network(network, dataset, 1000);
    std::cout << timer.get_duration() << std::endl;
    std::cin.get();
}
```
 
    