# Kamangar, Farhad
# 1000_123_456
# 2026_02_08
# Assignment_01_01

import numpy as np

def multi_layer_nn(X_train,Y_train,X_test,Y_test,layers,alpha,epochs,h=0.00001,seed=2):
    """Create and train a fully connected multi-layer neural network.

    Every hidden layer uses the sigmoid activation and the output layer is
    linear.  The loss is the mean-squared error (MSE), averaged over all
    samples and all output units.  Partial derivatives are NOT obtained by
    back-propagation; each one is approximated with the centered-difference
    formula ``(f(w + h) - f(w - h)) / (2 * h)``.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training input data of shape [num_train_samples, input_dimension].
    Y_train : numpy.ndarray
        Desired training outputs of shape
        [num_train_samples, output_dimension].  A 1-D array is treated as a
        single output column.
    X_test : numpy.ndarray
        Testing input data of shape [num_test_samples, input_dimension].
    Y_test : numpy.ndarray
        Desired testing outputs of shape [num_test_samples, output_dimension].
        A 1-D array is treated as a single output column.
    layers : list of int
        Number of neurons in each layer.  The input dimension is taken from
        ``X_train`` and is not part of this list; the last entry is the
        output layer and must equal the output dimension.
    alpha : float
        Learning rate for gradient descent.
    epochs : int
        Number of training epochs.
    h : float
        Step size used for the centered-difference approximation.
    seed : int
        Seed for the random number generator used to initialize weights.

    Returns
    -------
    list
        A list with three elements:

        1) weights : list of numpy.ndarray
           One 2-D weight matrix per layer, of shape
           [previous_layer_size + 1, layer_size].  The bias is the first row.
        2) mse_history : numpy.ndarray
           1-D array of length ``epochs`` holding the MSE on ``X_test``
           measured after each epoch with the weights frozen.
        3) Y_pred : numpy.ndarray
           Network output for ``X_test`` with the final weights, of shape
           [num_test_samples, output_dimension].

    Raises
    ------
    ValueError
        If ``layers`` is empty or its last entry does not match the number of
        target columns.

    Notes
    -----
    - Only NumPy is used.
    - The net input of a layer is ``net = X . W`` where ``X`` has a leading
      column of ones so that the first row of ``W`` acts as the bias.
    - The random number generator is re-seeded with ``np.random.seed(seed)``
      before each layer's weights are drawn with ``np.random.randn``; this
      changes NumPy's global random state.
    - Design choice: training is full-batch.  In every epoch the partial
      derivatives of all weights in all layers are estimated with the
      weights frozen, and only then is ``W = W - alpha * dE/dW`` applied to
      every layer at once.
    """
    X_train = _as_2d_float(X_train)
    Y_train = _as_2d_float(Y_train)
    X_test = _as_2d_float(X_test)
    Y_test = _as_2d_float(Y_test)

    layers = [int(n) for n in layers]
    if len(layers) == 0:
        raise ValueError("layers must contain at least one layer")
    if layers[-1] != Y_train.shape[1]:
        # Without this check NumPy could silently broadcast the error term
        # and report a meaningless MSE.
        raise ValueError(
            "the last entry of layers must equal the output dimension"
        )

    # Bias lives in row 0, hence the "+ 1" on the incoming dimension.
    weights = []
    fan_in = X_train.shape[1]
    for n_nodes in layers:
        np.random.seed(seed)
        weights.append(np.random.randn(fan_in + 1, n_nodes))
        fan_in = n_nodes

    mse_history = np.zeros(epochs)
    for epoch in range(epochs):
        # Estimate every gradient first so that all partial derivatives refer
        # to the same (pre-update) set of weights.
        gradients = [
            _centered_difference_gradient(weights, k, X_train, Y_train, h)
            for k in range(len(weights))
        ]
        for W, grad in zip(weights, gradients):
            W -= alpha * grad
        # Evaluation only: no weights are modified here.
        mse_history[epoch] = _mse(Y_test, _forward(weights, X_test))

    return [weights, mse_history, _forward(weights, X_test)]


def _as_2d_float(a):
    # Convert input to a float array; a 1-D array becomes a single column.
    a = np.asarray(a, dtype=float)
    if a.ndim == 1:
        a = a[:, np.newaxis]
    return a


def _forward(weights, X):
    # Propagate X through the network: sigmoid hidden layers, linear output.
    activation = X
    last = len(weights) - 1
    for k, W in enumerate(weights):
        ones = np.ones((activation.shape[0], 1))
        net = np.hstack([ones, activation]) @ W
        if k == last:
            activation = net
        else:
            # sigmoid(net) written as exp(-log(1 + exp(-net))) so that large
            # negative net values do not overflow.
            activation = np.exp(-np.logaddexp(0.0, -net))
    return activation


def _mse(Y, Y_hat):
    # Mean-squared error averaged over all samples and output units.
    return np.mean((Y - Y_hat) ** 2)


def _centered_difference_gradient(weights, layer_index, X, Y, h):
    # Approximate dE/dW for one layer by perturbing each weight by +/- h.
    W = weights[layer_index]
    grad = np.zeros_like(W)
    for idx in np.ndindex(*W.shape):
        saved = W[idx]
        W[idx] = saved + h
        loss_plus = _mse(Y, _forward(weights, X))
        W[idx] = saved - h
        loss_minus = _mse(Y, _forward(weights, X))
        W[idx] = saved  # restore before moving to the next weight
        grad[idx] = (loss_plus - loss_minus) / (2 * h)
    return grad
