This page catalogues the mathematical activation functions currently in common use, along with newer ones that are becoming popular. It may change on a reasonably regular basis as the industry evolves.
Activation functions play a crucial role in neural networks. They are responsible for introducing non-linearity to the model, enabling it to learn and perform complex tasks. Without activation functions, neural networks would essentially be linear models, which limits their capability in solving non-linear problems.
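To see why, stack two dense layers with no activation in between: the composition collapses into a single affine map. The sketch below (using hypothetical scalar weights, for illustration only) makes this explicit.
// Two "layers" with no activation: y = w2 * (w1 * x + b1) + b2
// This always simplifies to (w2 * w1) * x + (w2 * b1 + b2) -- a single linear model,
// so extra depth adds no expressive power without a non-linearity in between.
public double TwoLinearLayers(double x, double w1, double b1, double w2, double b2)
{
double hidden = w1 * x + b1; // first linear layer
return w2 * hidden + b2; // second linear layer, still linear in x
}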
$$f(x) = \arctan(x)$$
public double ArcTan(double x)
{
return Math.Atan(x);
}
$$f(x) = \frac{\sqrt{x^2 + 1} - 1}{2} + x$$
public double BentIdentity(double x)
{
return (Math.Sqrt(x * x + 1) - 1) / 2 + x;
}
$$f(x) = \begin{cases}
0 & \text{if } x < 0 \\
1 & \text{if } x \ge 0
\end{cases}
$$
public double BinaryStep(double x)
{
return x < 0 ? 0 : 1;
}
$$f(x) = \begin{cases}
x & \text{if } x \ge 0 \\
\alpha (e^x - 1) & \text{if } x < 0
\end{cases}
$$
public double ELU(double x, double alpha = 1.0)
{
return x >= 0 ? x : alpha * (Math.Exp(x) - 1);
}
$$f(x) = e^{-x^2}$$
public double Gaussian(double x)
{
return Math.Exp(-x * x);
}
$$f(x) = x$$
public double Identity(double x)
{
return x;
}
$$f(x) = \begin{cases}
x & \text{if } x \ge 0 \\
\alpha x & \text{if } x < 0
\end{cases}
$$
public double LeakyReLU(double x, double alpha = 0.01)
{
return x >= 0 ? x : alpha * x;
}
$$f(x) = \max(w_1^T x + b_1, w_2^T x + b_2)$$
// Maxout over two linear units (requires System.Linq for Zip and Sum)
public double Maxout(double[] inputs, double[] weights1, double[] weights2, double bias1, double bias2)
{
double sum1 = inputs.Zip(weights1, (input, weight) => input * weight).Sum() + bias1;
double sum2 = inputs.Zip(weights2, (input, weight) => input * weight).Sum() + bias2;
return Math.Max(sum1, sum2);
}
$$f(x) = x \cdot \tanh(\text{softplus}(x)) = x \cdot \tanh(\ln(1 + e^x))$$
public double Mish(double x)
{
return x * Math.Tanh(Math.Log(1 + Math.Exp(x)));
}
$$f(x) = \max(0, x)$$
public double ReLU(double x)
{
return Math.Max(0, x);
}
$$f(x) = \lambda \begin{cases}
x & \text{if } x \ge 0 \\
\alpha (e^x - 1) & \text{if } x < 0
\end{cases}
$$
public double SELU(double x, double alpha = 1.67326, double lambda = 1.0507)
{
return lambda * (x >= 0 ? x : alpha * (Math.Exp(x) - 1));
}
$$\sigma(x) = \frac{1}{1 + e^{-x}}$$
public double Sigmoid(double x)
{
return 1 / (1 + Math.Exp(-x));
}
$$f(x) = \sin(x)$$
public double Sinusoid(double x)
{
return Math.Sin(x);
}
$$\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_{j} e^{x_j}}$$
public double[] Softmax(double[] x)
{
double max = x.Max();
double sumExp = x.Select(val => Math.Exp(val - max)).Sum();
return x.Select(val => Math.Exp(val - max) / sumExp).ToArray();
}
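As a quick usage sketch (the scores below are arbitrary example values), the resulting probabilities always sum to 1, which is why softmax is typically applied to a network's output layer for classification:
// Example: raw scores for three classes
double[] scores = { 2.0, 1.0, 0.1 };
double[] probabilities = Softmax(scores); // ≈ { 0.659, 0.242, 0.099 }, summing to 1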
$$f(x) = \ln(1 + e^x)$$
public double Softplus(double x)
{
return Math.Log(1 + Math.Exp(x));
}
$$f(x) = \frac{x}{1 + |x|}$$
public double Softsign(double x)
{
return x / (1 + Math.Abs(x));
}
$$f(x) = \begin{cases}
t_1 + a_1(x - t_1) & \text{if } x \le t_1 \\
x & \text{if } t_1 < x < t_2 \\
t_2 + a_2(x - t_2) & \text{if } x \ge t_2
\end{cases}
$$
public double SReLU(double x, double t1, double t2, double a1, double a2)
{
if (x <= t1)
return t1 + a1 * (x - t1);
else if (x < t2)
return x;
else
return t2 + a2 * (x - t2);
}
$$f(x) = x \cdot \sigma(x) = x \cdot \frac{1}{1 + e^{-x}}$$
public double Swish(double x)
{
return x / (1 + Math.Exp(-x));
}
$$\tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}}$$
public double Tanh(double x)
{
return Math.Tanh(x);
}
$$f(x) = x - \tanh(x)$$
public double TanhShrink(double x)
{
return x - Math.Tanh(x);
}
$$f(x) = \max(0, \min(1, 0.2x + 0.5))$$
public double HardSigmoid(double x)
{
return Math.Max(0, Math.Min(1, 0.2 * x + 0.5));
}
$$f(x) = \begin{cases}
-1 & \text{if } x < -1 \\
x & \text{if } -1 \le x \le 1 \\
1 & \text{if } x > 1
\end{cases}
$$
public double HardTanh(double x)
{
return Math.Max(-1, Math.Min(1, x));
}
$$f(x) = \log(\frac{1}{1 + e^{-x}})$$
public double LogSigmoid(double x)
{
return Math.Log(1 / (1 + Math.Exp(-x)));
}
$$f(x) = \begin{cases}
1 & \text{if } x > 2 \\
x - \frac{x^2}{4} & \text{if } 0 \le x \le 2 \\
x + \frac{x^2}{4} & \text{if } -2 \le x < 0 \\
-1 & \text{if } x < -2
\end{cases}
$$
public double SQNL(double x)
{
if (x > 2)
return 1;
else if (x >= 0)
return x - x * x / 4;
else if (x >= -2)
return x + x * x / 4;
else
return -1;
}
$$f(x) = \begin{cases}
x & \text{if } x \ge 0 \\
x / \sqrt{1 + \alpha x^2} & \text{if } x < 0
\end{cases}
$$
public double ISRLU(double x, double alpha = 1.0)
{
return x >= 0 ? x : x / Math.Sqrt(1 + alpha * x * x);
}
$$f(x) = x \cdot \sigma(x) = x \cdot \frac{1}{1 + e^{-x}}$$
**Output range**: approximately (-0.278, ∞)
**Advantages**: Smooth, non-monotonic; identical to Swish (listed above)
**Disadvantages**: More computationally expensive than ReLU
public double SiLU(double x)
{
return x / (1 + Math.Exp(-x));
}
$$f(x) = \begin{cases}
x & \text{if } x \ge 0 \\
\alpha (e^{\frac{x}{\alpha}} - 1) & \text{if } x < 0
\end{cases}
$$
public double CELU(double x, double alpha = 1.0)
{
return x >= 0 ? x : alpha * (Math.Exp(x / alpha) - 1);
}
$$f(x) = \begin{cases}
-1 & \text{if } x < -1 \\
\tanh(x) & \text{if } -1 \le x \le 1 \\
1 & \text{if } x > 1
\end{cases}
$$
public double TanhClip(double x)
{
if (x < -1)
return -1;
else if (x > 1)
return 1;
else
return Math.Tanh(x);
}
$$f(x) = \begin{cases}
x & \text{if } x \ge 0 \\
\alpha x & \text{if } x < 0
\end{cases}
$$
public double PReLU(double x, double alpha)
{
return x >= 0 ? x : alpha * x;
}
$$f(x) = x \cdot P(X \leq x) = x \cdot \frac{1}{2}[1 + \text{erf}(\frac{x}{\sqrt{2}})]$$
public double GELU(double x)
{
// Tanh-based approximation of the exact erf form given above
return 0.5 * x * (1 + Math.Tanh(Math.Sqrt(2 / Math.PI) * (x + 0.044715 * Math.Pow(x, 3))));
}
$$f(x) = \frac{x}{\sqrt{1 + \alpha x^2}}$$
public double RAF(double x, double alpha = 1.0)
{
return x / Math.Sqrt(1 + alpha * x * x);
}
$$f(x) = \begin{cases}
x^n & \text{if } x \ge 0 \\
0 & \text{if } x < 0
\end{cases}
$$
public double RePU(double x, double n)
{
return x >= 0 ? Math.Pow(x, n) : 0;
}
$$f(x) = \ln(\cosh(x))$$
public double LogCosh(double x)
{
return Math.Log(Math.Cosh(x));
}
$$f(x) = \alpha \ln(1 + e^{\beta x})$$
public double ParametricSoftplus(double x, double alpha = 1.0, double beta = 1.0)
{
return alpha * Math.Log(1 + Math.Exp(beta * x));
}
$$f(x) = \min(\max(0, x), 6)$$
public double ReLU6(double x)
{
return Math.Min(Math.Max(0, x), 6);
}
$$f(x) = \sqrt{1 + x^2} - 1$$
public double SQ_RBF(double x)
{
return Math.Sqrt(1 + x * x) - 1;
}
$$f(x) = \frac{2}{1 + e^{-x}} - 1$$
public double SymmetricSigmoid(double x)
{
return 2 / (1 + Math.Exp(-x)) - 1;
}
$$f(x) = x \cdot \tanh(e^x)$$
public double TanhExp(double x)
{
return x * Math.Tanh(Math.Exp(x));
}
$$f(x) = \begin{cases}
x & \text{if } x > \theta \\
0 & \text{if } x \le \theta
\end{cases}
$$
public double ThresholdedReLU(double x, double theta = 1.0)
{
return x > theta ? x : 0;
}
$$f(x) = \max(0, 1 - |x|)$$
public double Triangular(double x)
{
return Math.Max(0, 1 - Math.Abs(x));
}
$$f(x) = \frac{1 - e^{-x}}{1 + e^{-x}}$$
public double BipolarSigmoid(double x)
{
return (1 - Math.Exp(-x)) / (1 + Math.Exp(-x));
}
Activation functions are the backbone of neural networks, providing the necessary non-linearity that allows the network to model complex data patterns. Each activation function has its strengths and weaknesses, and the choice of which to use can significantly impact the performance and convergence of the neural network. By understanding the mathematical underpinnings and characteristics of these functions, developers and researchers can make more informed decisions in their model architectures.