# Author: Matt Haberland

cimport cython
import numpy as np
cimport numpy as np
from scipy.linalg import (solve, lu_solve, lu_factor, solve_triangular,
                          LinAlgError)
from scipy.linalg.cython_blas cimport daxpy, dswap
try:
    from time import process_time as timer
except ImportError:
    from time import clock as timer

np.import_array()

__all__ = ['LU', 'BGLU']

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void swap_rows(self, double[:, ::1] H, int i):
    """
    Swaps row i of H with next row; represents matrix product by PI_i
    matrix described after matrix 5.10
    """
    # Python
    # H[[i, i+1]] = H[[i+1, i]]
    # Cython, using BLAS
    cdef int n = H.shape[1]-i
    cdef int one = 1
    dswap(&n, &H[i, i], &one, &H[i+1, i], &one)

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True) # not really important
cdef double row_subtract(self, double[:, ::1] H, int i):
    """
    Zeros first nonzero element of row i+1 of H by subtracting appropriate
    multiple of row i; represents matrix product by matrix 5.10. Returns
    factor g for storage.
    """
    cdef double g = H[i+1, i]/H[i,i]

    # Python
    # H[i+1, i:] -= g*H[i, i:]
    # Cython, using BLAS
    cdef int n = H.shape[1]-i
    cdef double ng = -g
    cdef int one = 1
    daxpy(&n, &ng, &H[i, i], &one, &H[i+1, i], &one)

    return g

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void hess_lu(self, double[:, ::1] H, int i, double[:,::1] ops):
    """
    Converts Hessenberg matrix H with first nonzero off-diagonal in
    column i to upper triangular, recording elementary row operations.
    That is, performs and records operations in Equation 5.9.
    """
    cdef int m = H.shape[1]
    cdef int j, k
    cdef double piv1, piv2
    cdef double g
    cdef double swap

    for k in range(i, m-1):
        j = k-i
        piv1, piv2 = abs(H[k, k]), abs(H[k+1, k]) # np.abs(H[k:k+2,k])
        swap = float(piv1 < piv2)
        # swap rows to ensure |g| <= 1
        if swap:
            swap_rows(self, H, k)
        g = row_subtract(self, H, k)

        ops[j, 0] = swap
        ops[j, 1] = g

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void perform_ops(self, double[::1] y, double[:,::1] ops, bint rev = False):
    """
    Replays operations needed to convert Hessenberg matrix into upper
    triangular form on a vector y. Equivalent to matrix multlication by
    inverse of matrix 5.12.
    """
    cdef int i, j, k, m
    cdef double g, swap

    m = y.shape[0]
    i = m - ops.shape[0] - 1
    if not rev:
        for k in range(i, m-1):
            j = k - i
            swap = ops[j,0]
            g = ops[j, 1]
            if swap:
                y[k], y[k+1] = y[k+1], y[k]

            y[k+1] -= g*y[k]
    else:
        for k in range(m-2, i-1, -1):
            j = k -i
            swap = ops[j,0]
            g = ops[j, 1]
            y[k] -= g*y[k+1]
            if swap:
                y[k], y[k+1] = y[k+1], y[k]

def _consider_refactor(method):
    """
    This decorator records the time spent in the major BGLU
    routines - refactor, update, and solve - in order to
    calculate the average time required to solve a system.
    It also forces PLU factorization of the basis matrix from
    scratch to minimize the average solve time and to
    accumulation of roundoff error.

    Immediately after PLU factorization, the average solve time
    will be rather high because PLU factorization is slow. For
    some number of factor updates, the average solve time is
    expected to decrease because the updates and solves are fast.
    However, updates increase the compexity of the factorization,
    so solve times are expected to increase with each update.
    When the average solve time stops decreasing and begins
    increasing, we perform PLU factorization from scratch rather
    than updating. PLU factorization is also performed after the
    maximum permitted number of updates is reached to prevent
    further accumulation of roundoff error.
    """
    def f(self, *args, **kwargs):

        refactor_now = False
        out = None

        if method.__name__ == "update":
            # this will get zeroed if we refactor
            self.updates += 1

            # if average solve time is increasing, then it would
            # be faster to refactor from scratch
            slowing_down = (self.average_solve_times[1] >
                            self.average_solve_times[0])

            # if update limit is reached, we should refactor to
            # limit error buildup
            too_many_updates = self.updates >= self.max_updates

            if self.mast:
                refactor_now = (slowing_down or too_many_updates)
            else:
                refactor_now = too_many_updates

            if refactor_now:
                # update basis indices and factor from scratch
                self.update_basis(*args, **kwargs)
                out = self.refactor()  # time will be recorded

        # If refactor_now is True, then self.refactor() is called
        # We don't want to call method = self.update again here
        if not refactor_now:
            # record the time it took to call the method
            t0 = timer()
            out = method(self, *args, **kwargs)
            if isinstance(out, np.ndarray) and np.any(np.isnan(out)):
                raise LinAlgError("Nans in output")
            t1 = timer()
            self.bglu_time += (t1-t0)

        # calculate average solve time,
        # considering all significant method calls
        if method.__name__ == "solve":
            self.solves += 1
            avg = self.bglu_time/self.solves
            self.average_solve_times = [
                self.average_solve_times[1], avg]

        return out
    return f


cdef class LU:
    """
    Represents PLU factorization of a basis matrix with naive rank-one updates
    """

    cdef public np.ndarray A
    cdef public np.ndarray b
    cdef public np.ndarray B
    cdef public int m
    cdef public int n

    def __init__(self, A, b):
        """ Given matrix A and basis indices b, form basis matrix B """
        self.A = A
        self.b = b
        self.B = A[:, b]
        self.m, self.n = A.shape

    def update(self, i, j):
        """ Rank-one update to basis and basis matrix """
        self.b[i:self.m-1] = self.b[i+1:self.m]
        self.b[-1] = j
#        self.b[i] = j
        self.B = self.A[:, self.b]

    def solve(self, q, transposed = False):
        """
        Solve B @ v = q
        """
        v = solve(self.B, q, transposed=transposed)
        return v


cdef class BGLU(LU):
    """
    Represents PLU factorization with Golub rank-one updates from
    Bartels, Richard H. "A stabilization of the simplex method."
    Numerische Mathematik 16.5 (1971): 414-434.
    """

    cdef public tuple plu
    cdef public np.ndarray L
    cdef public np.ndarray U
    cdef public np.ndarray pi
    cdef public np.ndarray pit
    cdef public list ops_list
    cdef public double bglu_time
    cdef public int solves
    cdef public int updates
    cdef public int max_updates
    cdef public list average_solve_times
    cdef public bint mast

    def __init__(self, A, b, max_updates=10, mast=False):
        """
        Given matrix A and basis indices b, perform PLU factorization of
        basis matrix B
        """
        self.A = A
        self.b = b
        self.m, self.n = A.shape
        self.max_updates = max_updates  # maximum updates between refactor
        self.refactor()
        self.mast = mast

    @_consider_refactor
    def refactor(self):
        # Factor as in Equation 5.1
        self.B = self.A[:, self.b]  # get basis matrix
        self.plu = lu_factor(self.B)  # lu_factor tested faster than lu
        self.L = self.plu[0]  # L and U stored in the same matrix
        self.U = self.plu[0].copy()  # need to modify without changing L
        # indexing with self.pi is equivalent to PI matrix product
        self.pi = self.perform_perm(self.plu[1])  # permutation indices
        self.pit = np.zeros(self.m, dtype=int)  # permutation transposed
        self.pit[self.pi] = np.arange(self.m)
        self.ops_list = []  # elementary row operations in order

        self.bglu_time = 0  # cumulative time spent updating and solving
        self.solves = 0     # number of solves since refactoring
        self.updates = 0    # number of updates since refactoring
        self.average_solve_times = [np.inf, np.inf]  # current and last average solve time

    # ideally should time this, too, but I also want to call this
    # method in update below, which would double-count the time.
    def update_basis(self, i, j):
        self.b[i:self.m-1] = self.b[i+1:self.m]  # eliminate i from basis
        self.b[-1] = j  # add j to end of basis

    @_consider_refactor
    def update(self, i, j):
        """ Perform rank-one update to basis and factorization """
        self.update_basis(i, j)

        # calculate last column of Hessenberg matrix
        # TODO: share this calculation with simplex method
        pla = self.A[self.pi, j]
        um = solve_triangular(self.L, pla, lower = True,
                              check_finite=False, unit_diagonal=True)
        for ops in self.ops_list:
            perform_ops(self, um, ops) # modifies um in place

        # form Hessenberg matrix
        H = self.U
        H[:, i:self.m-1] = self.U[:, i+1:self.m]  # eliminate column i
        H[:, -1] = um  # add column corresponding with j

        # convert H to upper triangular, recording elementary row operations
        ops = np.zeros((self.m-1-i, 2))
        hess_lu(self, H, i, ops) # hess_lu modifies ops in place
        self.ops_list.append(ops)

        self.U = H

    @_consider_refactor
    def solve(self, q, transposed = False):
        """
        Solve B @ v = q efficiently using factorization
        """
        if not self.ops_list:
            # before any updates, solve according to Equation 5.2
            v = lu_solve(self.plu, q, trans=transposed)
        else:
            if not transposed:
                q = q[self.pi]  # paper skips this by making
                                # "inessential assumption" of no permutation

                # Equation 5.16
                t = solve_triangular(self.L, q, lower = True,
                                     check_finite=False, unit_diagonal=True)

                # Equation 5.17
                temp = t
                for ops in self.ops_list:
                    perform_ops(self, temp, ops) # modifies temp in place
                w = temp

                # Equation 5.18
                # Faster to use U.T and set trans=True due to array order
                v = solve_triangular(self.U.T, w, lower=True,
                                     trans=True, check_finite=False)

            else: # do everything transposed and in reverse order
                t = solve_triangular(self.U.T, q, lower=True,
                                     trans=False, check_finite=False)
                temp = t
                for ops in reversed(self.ops_list):
                    perform_ops(self, temp, ops, rev = True) # mod in place
                w = temp
                v = solve_triangular(self.L, w, lower = True, trans=True,
                                     check_finite=False, unit_diagonal=True)
                v = v[self.pit]

        return v

    def perform_perm(self, p):
        """
        Perform individual row swaps defined in p returned by factor_lu to
        generate final permutation indices pi
        """
        pi = np.arange(len(p))
        for i, row in enumerate(p):
            pi[i], pi[row] = pi[row], pi[i]
        return pi