Use shared workspace for covariance calculation

HALFpipe · Sep 3, 2024 · b4f062e · b4f062e
1 parent 4227915
commit b4f062e
Show file tree

Hide file tree

Showing 3 changed files with 48 additions and 11 deletions.
diff --git a/src/gwas/src/gwas/covar.py b/src/gwas/src/gwas/covar.py
@@ -1,7 +1,8 @@
+import warnings
+
 import numpy as np
 import pandas as pd
 import scipy
-from numpy import typing as npt
 from upath import UPath
 
 from .compression.arr.base import CompressionMethod, FileArray, FileArrayWriter
@@ -20,25 +21,46 @@ def calc_covariance(
         logger.debug("Skip writing covariance matrix because it already exists")
         return path
 
+    sw = vc.sw
+
     array = vc.to_numpy()
-    names = [*vc.phenotype_names, *vc.covariate_names]
+    names = vc.names
+    row_count, variable_count = array.shape
 
     logger.debug("Calculating covariance matrix")
-    count = scipy.linalg.blas.dsyrk(alpha=1.0, a=np.isfinite(array), trans=1)
+    degrees_of_freedom_array = sw.alloc(
+        "degrees-of-freedom", variable_count, variable_count
+    )
+    degrees_of_freedom = degrees_of_freedom_array.to_numpy()
+    covariance_array = sw.alloc("covariance", variable_count, variable_count)
+    covariance = covariance_array.to_numpy()
+    a_array = sw.alloc("a", row_count, variable_count)
+    a = a_array.to_numpy()
 
-    # Subtract one to get degrees of freedom
-    degrees_of_freedom = count - 1
+    # Subtract one from counts to get degrees of freedom
+    np.isfinite(array, out=a)
+    scipy.linalg.blas.dsyrk(
+        alpha=1.0, a=a, trans=1, c=degrees_of_freedom, overwrite_c=True
+    )
+    np.subtract(degrees_of_freedom, 1, out=degrees_of_freedom)
 
     # Set lower triangle to 1 to avoid division by zero
-    x, y = np.tril_indices_from(count, k=-1)
+    x, y = np.tril_indices_from(degrees_of_freedom, k=-1)
     degrees_of_freedom[(x, y)] = 1
 
-    a = np.nan_to_num(array - np.nanmean(array, axis=0))
-    product = scipy.linalg.blas.dsyrk(alpha=1.0, a=a, trans=1)
+    a[:] = array
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=RuntimeWarning)
+        a -= np.nanmean(a, axis=0)
+    a = np.nan_to_num(a, copy=False)
+
+    scipy.linalg.blas.dsyrk(alpha=1.0, a=a, trans=1, c=covariance, overwrite_c=True)
+    a_array.free()
+
+    np.divide(covariance, degrees_of_freedom, out=covariance)
+    degrees_of_freedom_array.free()
 
-    covariance: npt.NDArray[np.float64] = product / degrees_of_freedom
     covariance[(x, y)] = covariance[(y, x)]
-    covariance = np.asfortranarray(covariance)
 
     writer: FileArrayWriter[np.float64] = FileArray.create(
         path,
@@ -55,4 +77,6 @@ def calc_covariance(
     with writer:
         writer[:, :] = covariance
 
+    covariance_array.free()
+
     return writer.file_path
diff --git a/src/gwas/src/gwas/pheno.py b/src/gwas/src/gwas/pheno.py
@@ -100,6 +100,10 @@ def data_frame(self) -> pd.DataFrame:
     def is_finite(self) -> bool:
         return bool(np.isfinite(self.to_numpy()).all())
 
+    @property
+    def names(self) -> list[str]:
+        return self.covariate_names + self.phenotype_names
+
     def __post_init__(self) -> None:
         if self.shape[1] != self.covariate_count + self.phenotype_count:
             raise ValueError(

diff --git a/src/gwas/tests/score/test_pheno.py b/src/gwas/tests/score/test_pheno.py
@@ -1,4 +1,5 @@
 import sys
+from typing import Any
 
 import numpy as np
 import pandas as pd
@@ -234,13 +235,21 @@ def test_covariance(
     ) as reader:
         covariance = reader[:, :]
 
-    pandas_covariance = pd.DataFrame(variable_collection.to_numpy()).cov().to_numpy()
+    kwargs: dict[str, Any] = dict(sep="\t", index_col=0, header=0)
+    pandas_frame = pd.read_table(str(covariate_path), **kwargs).combine_first(
+        pd.read_table(str(phenotype_path), **kwargs)
+    )
+    pandas_frame.insert(0, "intercept", 1.0)
+    pandas_covariance_frame = pandas_frame.cov()
+    pandas_covariance = pandas_covariance_frame.to_numpy()
     check_bias(covariance, pandas_covariance)
 
     array = variable_collection.to_numpy()
     c = np.ma.array(array, mask=np.isnan(array))
     numpy_covariance = np.ma.cov(c, rowvar=False, allow_masked=True).filled(np.nan)
     np.testing.assert_allclose(covariance, numpy_covariance)
 
+    assert pandas_covariance_frame.columns.tolist() == variable_collection.names
+
     new_allocation_names = {variable_collection.name}
     assert set(sw.allocations.keys()) <= (allocation_names | new_allocation_names)