improving export grid performance

scarrazza · Apr 24, 2015 · 36cc689 · 36cc689
1 parent feed362
commit 36cc689
Show file tree

Hide file tree

Showing 4 changed files with 276 additions and 423 deletions.
diff --git a/common.py b/common.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python
+""" A Monte Carlo to Hessian conversion tool """
+
+__author__ = 'Stefano Carrazza'
+__license__ = 'GPL'
+__version__ = '1.0.0'
+__email__ = '[email protected]'
+
+
+import numpy
+import lhapdf
+from numba import jit
+
+class LocalPDF:
+    """ A simple class for PDF manipulation """
+    def __init__(self, pdf_name, nrep, xgrid, fl, Q, eps):
+        self.pdf_name = pdf_name
+        self.pdf = lhapdf.mkPDFs(pdf_name)
+        self.n = nrep
+        self.n_rep = len(self.pdf)-1
+        self.fl = fl
+        self.xgrid = xgrid
+        self.Q = Q
+        self.xfxQ = numpy.zeros(shape=(self.n_rep, fl.n, xgrid.n))
+        self.base = numpy.zeros(shape=self.n_rep, dtype=numpy.int64)
+        self.fin = numpy.zeros(shape=self.n, dtype=numpy.int64)
+
+        for i in range(self.n_rep): self.base[i] = i+1
+        for i in range(self.n): self.fin[i] = i+1
+
+        # precomputing values
+        for r in range(self.n_rep):
+            for f in range(fl.n):
+                for ix in range(xgrid.n):
+                    self.xfxQ[r, f, ix] = self.pdf[self.base[r]].xfxQ(fl.id[f], xgrid.x[ix], Q)
+
+        # precomputing averages
+        self.f0 = numpy.zeros(shape=(fl.n, xgrid.n))
+        for f in range(fl.n):
+            for ix in range(xgrid.n):
+                self.f0[f, ix] = self.pdf[0].xfxQ(fl.id[f], xgrid.x[ix], Q)
+
+        # compute std dev
+        self.std = numpy.std(self.xfxQ, axis=0, ddof=1)
+
+        # lower, upper 68cl
+        self.std68 = numpy.zeros(shape=(fl.n, xgrid.n))
+        for f in range(fl.n):
+            low, up = get_limits(self.xfxQ[:,f,:], self.f0[f,:])
+            self.std68[f] = (up-low)/2.0
+
+        # maximum difference between std vs 68cl. -> create pandas array
+        self.mask = numpy.array([ abs(1 - self.std[f,:]/self.std68[f,:]) <= eps for f in range(fl.n)])
+        print " [Info] Keeping ", numpy.count_nonzero(self.mask), "nf*nx using (1-std/68cl) <= eps =", eps
+
+    @jit
+    def fill_cov(self, nf, nx, xfxQ, f0, mask):
+        n = len(xfxQ)
+        cov = numpy.zeros(shape=(nf*nx,nf*nx))
+        for fi in range(nf):
+            for fj in range(nf):
+                for ix in range(nx):
+                    for jx in range(nx):
+                        if mask[fi, ix] and mask[fj, jx]:
+                            i = nx*fi+ix
+                            j = nx*fj+jx
+                            for r in range(n):
+                                cov[i, j] += (xfxQ[r, fi, ix] - f0[fi,ix])*(xfxQ[r, fj, jx] - f0[fj,jx])
+        return cov/(n-1.0)
+
+    def pdfcovmat(self):
+        """ Build PDF covariance matrices """
+        cov = self.fill_cov(self.fl.n, self.xgrid.n, self.xfxQ, self.f0, self.mask)
+        return cov
+
+    @jit
+    def rebase(self, basis):
+        fin = numpy.sort(basis)
+        ind = 0
+        negative = numpy.zeros(shape=(self.n_rep-self.n), dtype=numpy.int64)
+        for i in range(self.n_rep):
+            it = False
+            for j in fin:
+                if j == i+1: it = True
+            if it == False:
+                negative[ind] = i+1
+                ind+=1
+        self.fin = fin
+        self.base = numpy.append(fin, negative)
+
+        # precomputing values
+        for r in range(self.n_rep):
+            for f in range(self.fl.n):
+                for ix in range(self.xgrid.n):
+                    self.xfxQ[r, f, ix] = self.pdf[self.base[r]].xfxQ(self.fl.id[f], self.xgrid.x[ix], self.Q)
+
+def get_limits(xfxQ, f0):
+    reps,l = xfxQ.shape
+    d = numpy.abs(xfxQ - f0)
+    ind = numpy.argsort(d, axis=0)
+    ind68 = 68*reps//100
+    sr = xfxQ[ind,numpy.arange(0,l)][:ind68,:]
+    up1s = numpy.max(sr,axis=0)
+    low1s = numpy.min(sr,axis=0)
+    return low1s, up1s
+
+class XGrid:
+    """ The x grid points used by the test """
+    def __init__(self, xminlog=1e-5, xminlin=1e-1, nplog=25, nplin=25):
+        self.x = numpy.append(numpy.logspace(numpy.log10(xminlog), numpy.log10(xminlin), num=nplog, endpoint=False),
+                              numpy.linspace(xminlin, 0.9, num=nplin, endpoint=False))
+        self.n = len(self.x)
+
+class Flavors:
+    """ The flavor container """
+    def __init__(self, nf=3):
+        self.id = numpy.arange(-nf,nf+1)
+        self.n = len(self.id)
+
+def invcov_sqrtinvcov(cov):
+    val, vec = numpy.linalg.eigh(cov)
+    invval = numpy.zeros(shape=len(val))
+    sqrtval = numpy.zeros(shape=len(val))
+    for i in range(len(val)):
+        if val[i] > 1e-12:
+            invval[i] = 1.0/val[i]
+            sqrtval[i] = 1.0/val[i]**0.5
+        else:
+            print " [Warning] Removing eigenvalue", i, val[i]
+
+    invcov = numpy.dot(vec, numpy.diag(invval)).dot(vec.T)
+    sqrtinvcov = numpy.dot(vec, numpy.diag(sqrtval)).dot(vec.T)
+    return invcov, sqrtinvcov
diff --git a/extra/basisgen.py b/extra/basisgen.py
diff --git a/lh.py b/lh.py
@@ -0,0 +1,137 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Apr 21 22:00:01 2015
+
+@author: zah
+"""
+
+import os
+import sys
+import shutil
+import lhapdf
+import numpy as np
+
+import pandas as pd
+
+def split_sep(f):
+    for line in f:
+        if line.startswith(b'---'):
+            break
+        yield line
+
+def read_xqf_from_file(f):
+
+    lines = split_sep(f)
+    try:
+        (xtext, qtext, ftext) = [next(lines) for _ in range(3)]
+    except StopIteration:
+        return None
+    xvals = np.fromstring(xtext, sep = " ")
+    qvals = np.fromstring(qtext, sep = " ")
+    fvals = np.fromstring(ftext, sep = " ", dtype=np.int)
+    vals = np.fromstring(b''.join(lines), sep= " ")
+    return pd.Series(vals, index = pd.MultiIndex.from_product((xvals, qvals, fvals)))
+
+
+def read_xqf_from_lhapdf(pdf, rep0grids):
+    indexes = tuple(rep0grids.index)
+    vals = []
+    for x in indexes:
+        vals += [pdf.xfxQ(x[3],x[1],x[2])]
+    return pd.Series(vals, index = rep0grids.index)
+
+def read_all_xqf(f):
+    while True:
+        result = read_xqf_from_file(f)
+        if result is None:
+            return
+        yield result
+
+#TODO: Make pdf_name the pdf_name instead of path
+def load_replica_2(rep, pdf_name, pdf=None, rep0grids=None):
+
+    sys.stdout.write("-> Reading replica from LHAPDF %d \r" % rep)
+    sys.stdout.flush()
+
+    suffix = str(rep).zfill(4)
+    with open(pdf_name + "_" + suffix + ".dat", 'rb') as inn:
+        header = b"".join(split_sep(inn))
+
+        if rep0grids is not None:
+            xfqs = read_xqf_from_lhapdf(pdf, rep0grids)
+        else:
+            xfqs = list(read_all_xqf(inn))
+            xfqs = pd.concat(xfqs, keys=range(len(xfqs)))
+    return header, xfqs
+
+#Split this to debug easily
+def _rep_to_buffer(out, header, subgrids):
+    sep = b'---'
+    out.write(header)
+    out.write(sep)
+    for _,g in subgrids.groupby(level=0):
+        out.write(b'\n')
+        ind = g.index.get_level_values(1).unique()
+        np.savetxt(out, ind, fmt='%.7E',delimiter=' ', newline=' ')
+        out.write(b'\n')
+        ind = g.index.get_level_values(2).unique()
+        np.savetxt(out, ind, fmt='%.7E',delimiter=' ', newline=' ')
+        out.write(b'\n')
+        #Integer format
+        ind = g.index.get_level_values(3).unique()
+        np.savetxt(out, ind, delimiter=' ', fmt="%d",
+                      newline=' ')
+        out.write(b'\n ')
+        #Reshape so printing is easy
+        reshaped = g.reshape((len(g.groupby(level=1))*len(g.groupby(level=2)),
+                              len(g.groupby(level=3))))
+        np.savetxt(out, reshaped, delimiter=" ", newline="\n ", fmt='%14.7E')
+        out.write(sep)
+
+def write_replica(rep, pdf_name, header, subgrids):
+    suffix = str(rep).zfill(4)
+    with open(pdf_name + "_" + suffix + ".dat", 'wb') as out:
+        _rep_to_buffer(out, header, subgrids)
+
+def load_all_replicas(pdf, pdf_name):
+    rep0headers, rep0grids = load_replica_2(0,pdf_name)
+    headers, grids = zip(*[load_replica_2(rep, pdf_name, pdf.pdf[pdf.base[rep]], rep0grids) for rep in range(pdf.n)])
+    return [rep0headers] + list(headers), [rep0grids] + list(grids)
+
+def big_matrix(gridlist):
+    central_value = gridlist[0]
+    X = pd.concat(gridlist[1:], axis=1,
+                 keys=range(1,len(gridlist)+1), #avoid confusion with rep0
+                 ).subtract(central_value, axis=0)
+    if np.any(X.isnull()) or X.shape[0] != len(central_value):
+        raise ValueError("Incompatible grid specifications")
+    return X
+
+def hessian_from_lincomb(pdf, V):
+
+    # preparing output folder
+    neig = V.shape[1]
+
+    base = lhapdf.paths()[0] + "/" + pdf.pdf_name + "/" + pdf.pdf_name
+    file = pdf.pdf_name + "_hessian_" + str(neig)
+    if not os.path.exists(file): os.makedirs(file)
+
+    # copy replica 0
+    shutil.copy(base + "_0000.dat", file + "/" + file + "_0000.dat")
+
+    inn = open(base + ".info", 'rb')
+    out = open(file + "/" + file + ".info", 'wb')
+    for l in inn.readlines():
+        if l.find("SetDesc:") >= 0: out.write("SetDesc: \"Hessian " + pdf.pdf_name + "_hessian\"\n")
+        elif l.find("NumMembers:") >= 0: out.write("NumMembers: " + str(neig+1) + "\n")
+        elif l.find("ErrorType: replicas") >= 0: out.write("ErrorType: symmhessian\n")
+        else: out.write(l)
+    inn.close()
+    out.close()
+
+    headers, grids = load_all_replicas(pdf, base)
+    hess_name = file + '/' + file
+    result  = (big_matrix(grids).dot(V)).add(grids[0], axis=0, )
+    hess_header = b"PdfType: error\nFormat: lhagrid1\n"
+    for column in result.columns:
+        write_replica(column + 1, hess_name, hess_header, result[column])