import numpy as np


#pythran export _Aij(float[:,:], int, int)
#pythran export _Aij(int[:,:], int, int)
def _Aij(A, i, j):
    """Sum of upper-left and lower-right blocks of contingency table."""
    # See `somersd` References [2] bottom of page 309
    return A[:i, :j].sum() + A[i+1:, j+1:].sum()


#pythran export _Dij(float[:,:], int, int)
#pythran export _Dij(int[:,:], int, int)
def _Dij(A, i, j):
    """Sum of lower-left and upper-right blocks of contingency table."""
    # See `somersd` References [2] bottom of page 309
    return A[i+1:, :j].sum() + A[:i, j+1:].sum()


#pythran export _concordant_pairs(float[:,:])
#pythran export _concordant_pairs(int[:,:])
def _concordant_pairs(A):
    """Twice the number of concordant pairs, excluding ties."""
    # See `somersd` References [2] bottom of page 309
    m, n = A.shape
    count = 0
    for i in range(m):
        for j in range(n):
            count += A[i, j]*_Aij(A, i, j)
    return count


#pythran export _discordant_pairs(float[:,:])
#pythran export _discordant_pairs(int[:,:])
def _discordant_pairs(A):
    """Twice the number of discordant pairs, excluding ties."""
    # See `somersd` References [2] bottom of page 309
    m, n = A.shape
    count = 0
    for i in range(m):
        for j in range(n):
            count += A[i, j]*_Dij(A, i, j)
    return count


#pythran export _a_ij_Aij_Dij2(float[:,:])
#pythran export _a_ij_Aij_Dij2(int[:,:])
def _a_ij_Aij_Dij2(A):
    """A term that appears in the ASE of Kendall's tau and Somers' D."""
    # See `somersd` References [2] section 4:
    # Modified ASEs to test the null hypothesis...
    m, n = A.shape
    count = 0
    for i in range(m):
        for j in range(n):
            count += A[i, j]*(_Aij(A, i, j) - _Dij(A, i, j))**2
    return count
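

# Illustrative note (comments only, not part of the Pythran exports above):
# for the 2x2 contingency table A = [[3, 1], [1, 3]], the only concordant
# combination pairs the 3 observations in cell (0, 0) with the 3 in cell
# (1, 1), giving 9 concordant pairs, and the only discordant combination
# pairs cell (0, 1) with cell (1, 0), giving 1 discordant pair.  Hence
# _concordant_pairs(A) returns 18 and _discordant_pairs(A) returns 2,
# i.e. twice the respective counts, as the docstrings above state.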


#pythran export _compute_outer_prob_inside_method(int64, int64, int64, int64)
def _compute_outer_prob_inside_method(m, n, g, h):
    """
    Count the proportion of paths that do not stay strictly inside two
    diagonal lines.

    Parameters
    ----------
    m : integer
        m > 0
    n : integer
        n > 0
    g : integer
        g is the greatest common divisor of m and n
    h : integer
        0 <= h <= lcm(m, n)

    Returns
    -------
    p : float
        The proportion of paths that do not stay inside the two lines.

    The classical algorithm counts the integer lattice paths from (0, 0)
    to (m, n) which satisfy |x/m - y/n| < h / lcm(m, n).
    The paths make steps of size +1 in either positive x or positive y
    directions.
    We are, however, interested in 1 - proportion in order to compute
    p-values, so we change the recursion to compute 1 - p directly while
    staying within the "inside method" as described by Hodges.

    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
    Hodges, J.L. Jr.,
    "The Significance Probability of the Smirnov Two-Sample Test,"
    Arkiv för Matematik, 3, No. 43 (1958), 469-86.

    For the recursion for 1 - p see
    Viehmann, T.: "Numerically more stable computation of the p-values for the
    two-sample Kolmogorov-Smirnov test," arXiv: 2102.08037
    """
    # Probability is symmetrical in m, n.  Computation below uses m >= n.
    if m < n:
        m, n = n, m
    mg = m // g
    ng = n // g

    # Count the integer lattice paths from (0, 0) to (m, n) which satisfy
    # |nx/g - my/g| < h.
    # Compute matrix A such that:
    #  A(x, 0) = A(0, y) = 1
    #  A(x, y) = A(x, y-1) + A(x-1, y), for x, y >= 1, except that
    #  A(x, y) = 0 if |nx/g - my/g| >= h
    # Probability is A(m, n)/binom(m+n, n)
    # Optimizations exist for m == n, m == n*p.
    # Only need to preserve a single column of A, and only a
    # sliding window of it.
    # minj keeps track of the slide.
    minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1)
    curlen = maxj - minj
    # Make a vector long enough to hold the maximum window needed.
    lenA = min(2 * maxj + 2, n + 1)
    # A direct count is an integer calculation, but the entries are
    # essentially binomial coefficients, hence grow quickly.
    # Scaling after each column is computed avoids dividing by a
    # large binomial coefficient at the end, but is not sufficient to avoid
    # the large dynamic range which appears during the calculation.
    # Instead, the recursion below works with the proportion of paths to each
    # lattice point that have left the band (the entry divided by the
    # corresponding binomial coefficient), so every stored value stays
    # in [0, 1].
    dtype = np.float64
    A = np.ones(lenA, dtype=dtype)
    # Initialize the first column
    A[minj:maxj] = 0.0
    for i in range(1, m + 1):
        # Generate the next column.
        # First calculate the sliding window
        lastminj, lastlen = minj, curlen
        minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0)
        minj = min(minj, n)
        maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1)
        if maxj <= minj:
            return 1.0
        # Now fill in the values. We cannot use cumsum, unfortunately.
        val = 0.0 if minj == 0 else 1.0
        for jj in range(maxj - minj):
            j = jj + minj
            val = (A[jj + minj - lastminj] * i + val * j) / (i + j)
            A[jj] = val
        curlen = maxj - minj
        if lastlen > curlen:
            # Set some carried-over elements to 1
            A[maxj - minj:maxj - minj + (lastlen - curlen)] = 1

    return A[maxj - minj - 1]
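

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only).  This guard is not one of the
    # Pythran exports above; it is assumed, not verified here, that the
    # Pythran build either tolerates or strips a __main__ block, and the
    # sample sizes and statistic below are made-up values.
    #
    # For the exact two-sided two-sample Kolmogorov-Smirnov test with sample
    # sizes m and n and observed statistic d, the p-value is the proportion
    # of lattice paths that do not stay strictly inside the band
    # |x/m - y/n| < d, which is what _compute_outer_prob_inside_method
    # returns when called with g = gcd(m, n) and h = round(d * lcm(m, n)).
    from math import gcd

    m, n = 5, 7                  # hypothetical sample sizes
    d = 3 / 7                    # hypothetical observed KS statistic
    g = gcd(m, n)
    lcm = (m // g) * n
    h = int(round(d * lcm))
    p = _compute_outer_prob_inside_method(m, n, g, h)
    print("two-sided exact p-value:", p)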