Skip to content

Commit

Permalink
Use shared workspace for covariance calculation
Browse files Browse the repository at this point in the history
  • Loading branch information
HippocampusGirl committed Sep 3, 2024
1 parent 4227915 commit b4f062e
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 11 deletions.
44 changes: 34 additions & 10 deletions src/gwas/src/gwas/covar.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import warnings

import numpy as np
import pandas as pd
import scipy
from numpy import typing as npt
from upath import UPath

from .compression.arr.base import CompressionMethod, FileArray, FileArrayWriter
Expand All @@ -20,25 +21,46 @@ def calc_covariance(
logger.debug("Skip writing covariance matrix because it already exists")
return path

sw = vc.sw

array = vc.to_numpy()
names = [*vc.phenotype_names, *vc.covariate_names]
names = vc.names
row_count, variable_count = array.shape

logger.debug("Calculating covariance matrix")
count = scipy.linalg.blas.dsyrk(alpha=1.0, a=np.isfinite(array), trans=1)
degrees_of_freedom_array = sw.alloc(
"degrees-of-freedom", variable_count, variable_count
)
degrees_of_freedom = degrees_of_freedom_array.to_numpy()
covariance_array = sw.alloc("covariance", variable_count, variable_count)
covariance = covariance_array.to_numpy()
a_array = sw.alloc("a", row_count, variable_count)
a = a_array.to_numpy()

# Subtract one to get degrees of freedom
degrees_of_freedom = count - 1
# Subtract one from counts to get degrees of freedom
np.isfinite(array, out=a)
scipy.linalg.blas.dsyrk(
alpha=1.0, a=a, trans=1, c=degrees_of_freedom, overwrite_c=True
)
np.subtract(degrees_of_freedom, 1, out=degrees_of_freedom)

# Set lower triangle to 1 to avoid division by zero
x, y = np.tril_indices_from(count, k=-1)
x, y = np.tril_indices_from(degrees_of_freedom, k=-1)
degrees_of_freedom[(x, y)] = 1

a = np.nan_to_num(array - np.nanmean(array, axis=0))
product = scipy.linalg.blas.dsyrk(alpha=1.0, a=a, trans=1)
a[:] = array
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=RuntimeWarning)
a -= np.nanmean(a, axis=0)
a = np.nan_to_num(a, copy=False)

scipy.linalg.blas.dsyrk(alpha=1.0, a=a, trans=1, c=covariance, overwrite_c=True)
a_array.free()

np.divide(covariance, degrees_of_freedom, out=covariance)
degrees_of_freedom_array.free()

covariance: npt.NDArray[np.float64] = product / degrees_of_freedom
covariance[(x, y)] = covariance[(y, x)]
covariance = np.asfortranarray(covariance)

writer: FileArrayWriter[np.float64] = FileArray.create(
path,
Expand All @@ -55,4 +77,6 @@ def calc_covariance(
with writer:
writer[:, :] = covariance

covariance_array.free()

return writer.file_path
4 changes: 4 additions & 0 deletions src/gwas/src/gwas/pheno.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,10 @@ def data_frame(self) -> pd.DataFrame:
def is_finite(self) -> bool:
return bool(np.isfinite(self.to_numpy()).all())

@property
def names(self) -> list[str]:
return self.covariate_names + self.phenotype_names

def __post_init__(self) -> None:
if self.shape[1] != self.covariate_count + self.phenotype_count:
raise ValueError(
Expand Down
11 changes: 10 additions & 1 deletion src/gwas/tests/score/test_pheno.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import sys
from typing import Any

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -234,13 +235,21 @@ def test_covariance(
) as reader:
covariance = reader[:, :]

pandas_covariance = pd.DataFrame(variable_collection.to_numpy()).cov().to_numpy()
kwargs: dict[str, Any] = dict(sep="\t", index_col=0, header=0)
pandas_frame = pd.read_table(str(covariate_path), **kwargs).combine_first(
pd.read_table(str(phenotype_path), **kwargs)
)
pandas_frame.insert(0, "intercept", 1.0)
pandas_covariance_frame = pandas_frame.cov()
pandas_covariance = pandas_covariance_frame.to_numpy()
check_bias(covariance, pandas_covariance)

array = variable_collection.to_numpy()
c = np.ma.array(array, mask=np.isnan(array))
numpy_covariance = np.ma.cov(c, rowvar=False, allow_masked=True).filled(np.nan)
np.testing.assert_allclose(covariance, numpy_covariance)

assert pandas_covariance_frame.columns.tolist() == variable_collection.names

new_allocation_names = {variable_collection.name}
assert set(sw.allocations.keys()) <= (allocation_names | new_allocation_names)

0 comments on commit b4f062e

Please sign in to comment.