Skip to content

Commit

Permalink
improving export grid performance
Browse files Browse the repository at this point in the history
  • Loading branch information
scarrazza committed Apr 24, 2015
1 parent feed362 commit 36cc689
Show file tree
Hide file tree
Showing 4 changed files with 276 additions and 423 deletions.
133 changes: 133 additions & 0 deletions common.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
#!/usr/bin/env python
""" A Monte Carlo to Hessian conversion tool """

__author__ = 'Stefano Carrazza'
__license__ = 'GPL'
__version__ = '1.0.0'
__email__ = '[email protected]'


import numpy
import lhapdf
from numba import jit

class LocalPDF:
""" A simple class for PDF manipulation """
def __init__(self, pdf_name, nrep, xgrid, fl, Q, eps):
self.pdf_name = pdf_name
self.pdf = lhapdf.mkPDFs(pdf_name)
self.n = nrep
self.n_rep = len(self.pdf)-1
self.fl = fl
self.xgrid = xgrid
self.Q = Q
self.xfxQ = numpy.zeros(shape=(self.n_rep, fl.n, xgrid.n))
self.base = numpy.zeros(shape=self.n_rep, dtype=numpy.int64)
self.fin = numpy.zeros(shape=self.n, dtype=numpy.int64)

for i in range(self.n_rep): self.base[i] = i+1
for i in range(self.n): self.fin[i] = i+1

# precomputing values
for r in range(self.n_rep):
for f in range(fl.n):
for ix in range(xgrid.n):
self.xfxQ[r, f, ix] = self.pdf[self.base[r]].xfxQ(fl.id[f], xgrid.x[ix], Q)

# precomputing averages
self.f0 = numpy.zeros(shape=(fl.n, xgrid.n))
for f in range(fl.n):
for ix in range(xgrid.n):
self.f0[f, ix] = self.pdf[0].xfxQ(fl.id[f], xgrid.x[ix], Q)

# compute std dev
self.std = numpy.std(self.xfxQ, axis=0, ddof=1)

# lower, upper 68cl
self.std68 = numpy.zeros(shape=(fl.n, xgrid.n))
for f in range(fl.n):
low, up = get_limits(self.xfxQ[:,f,:], self.f0[f,:])
self.std68[f] = (up-low)/2.0

# maximum difference between std vs 68cl. -> create pandas array
self.mask = numpy.array([ abs(1 - self.std[f,:]/self.std68[f,:]) <= eps for f in range(fl.n)])
print " [Info] Keeping ", numpy.count_nonzero(self.mask), "nf*nx using (1-std/68cl) <= eps =", eps

@jit
def fill_cov(self, nf, nx, xfxQ, f0, mask):
n = len(xfxQ)
cov = numpy.zeros(shape=(nf*nx,nf*nx))
for fi in range(nf):
for fj in range(nf):
for ix in range(nx):
for jx in range(nx):
if mask[fi, ix] and mask[fj, jx]:
i = nx*fi+ix
j = nx*fj+jx
for r in range(n):
cov[i, j] += (xfxQ[r, fi, ix] - f0[fi,ix])*(xfxQ[r, fj, jx] - f0[fj,jx])
return cov/(n-1.0)

def pdfcovmat(self):
""" Build PDF covariance matrices """
cov = self.fill_cov(self.fl.n, self.xgrid.n, self.xfxQ, self.f0, self.mask)
return cov

@jit
def rebase(self, basis):
fin = numpy.sort(basis)
ind = 0
negative = numpy.zeros(shape=(self.n_rep-self.n), dtype=numpy.int64)
for i in range(self.n_rep):
it = False
for j in fin:
if j == i+1: it = True
if it == False:
negative[ind] = i+1
ind+=1
self.fin = fin
self.base = numpy.append(fin, negative)

# precomputing values
for r in range(self.n_rep):
for f in range(self.fl.n):
for ix in range(self.xgrid.n):
self.xfxQ[r, f, ix] = self.pdf[self.base[r]].xfxQ(self.fl.id[f], self.xgrid.x[ix], self.Q)

def get_limits(xfxQ, f0):
reps,l = xfxQ.shape
d = numpy.abs(xfxQ - f0)
ind = numpy.argsort(d, axis=0)
ind68 = 68*reps//100
sr = xfxQ[ind,numpy.arange(0,l)][:ind68,:]
up1s = numpy.max(sr,axis=0)
low1s = numpy.min(sr,axis=0)
return low1s, up1s

class XGrid:
""" The x grid points used by the test """
def __init__(self, xminlog=1e-5, xminlin=1e-1, nplog=25, nplin=25):
self.x = numpy.append(numpy.logspace(numpy.log10(xminlog), numpy.log10(xminlin), num=nplog, endpoint=False),
numpy.linspace(xminlin, 0.9, num=nplin, endpoint=False))
self.n = len(self.x)

class Flavors:
""" The flavor container """
def __init__(self, nf=3):
self.id = numpy.arange(-nf,nf+1)
self.n = len(self.id)

def invcov_sqrtinvcov(cov):
val, vec = numpy.linalg.eigh(cov)
invval = numpy.zeros(shape=len(val))
sqrtval = numpy.zeros(shape=len(val))
for i in range(len(val)):
if val[i] > 1e-12:
invval[i] = 1.0/val[i]
sqrtval[i] = 1.0/val[i]**0.5
else:
print " [Warning] Removing eigenvalue", i, val[i]

invcov = numpy.dot(vec, numpy.diag(invval)).dot(vec.T)
sqrtinvcov = numpy.dot(vec, numpy.diag(sqrtval)).dot(vec.T)
return invcov, sqrtinvcov
112 changes: 0 additions & 112 deletions extra/basisgen.py

This file was deleted.

137 changes: 137 additions & 0 deletions lh.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 21 22:00:01 2015
@author: zah
"""

import os
import sys
import shutil
import lhapdf
import numpy as np

import pandas as pd

def split_sep(f):
for line in f:
if line.startswith(b'---'):
break
yield line

def read_xqf_from_file(f):

lines = split_sep(f)
try:
(xtext, qtext, ftext) = [next(lines) for _ in range(3)]
except StopIteration:
return None
xvals = np.fromstring(xtext, sep = " ")
qvals = np.fromstring(qtext, sep = " ")
fvals = np.fromstring(ftext, sep = " ", dtype=np.int)
vals = np.fromstring(b''.join(lines), sep= " ")
return pd.Series(vals, index = pd.MultiIndex.from_product((xvals, qvals, fvals)))


def read_xqf_from_lhapdf(pdf, rep0grids):
indexes = tuple(rep0grids.index)
vals = []
for x in indexes:
vals += [pdf.xfxQ(x[3],x[1],x[2])]
return pd.Series(vals, index = rep0grids.index)

def read_all_xqf(f):
while True:
result = read_xqf_from_file(f)
if result is None:
return
yield result

#TODO: Make pdf_name the pdf_name instead of path
def load_replica_2(rep, pdf_name, pdf=None, rep0grids=None):

sys.stdout.write("-> Reading replica from LHAPDF %d \r" % rep)
sys.stdout.flush()

suffix = str(rep).zfill(4)
with open(pdf_name + "_" + suffix + ".dat", 'rb') as inn:
header = b"".join(split_sep(inn))

if rep0grids is not None:
xfqs = read_xqf_from_lhapdf(pdf, rep0grids)
else:
xfqs = list(read_all_xqf(inn))
xfqs = pd.concat(xfqs, keys=range(len(xfqs)))
return header, xfqs

#Split this to debug easily
def _rep_to_buffer(out, header, subgrids):
sep = b'---'
out.write(header)
out.write(sep)
for _,g in subgrids.groupby(level=0):
out.write(b'\n')
ind = g.index.get_level_values(1).unique()
np.savetxt(out, ind, fmt='%.7E',delimiter=' ', newline=' ')
out.write(b'\n')
ind = g.index.get_level_values(2).unique()
np.savetxt(out, ind, fmt='%.7E',delimiter=' ', newline=' ')
out.write(b'\n')
#Integer format
ind = g.index.get_level_values(3).unique()
np.savetxt(out, ind, delimiter=' ', fmt="%d",
newline=' ')
out.write(b'\n ')
#Reshape so printing is easy
reshaped = g.reshape((len(g.groupby(level=1))*len(g.groupby(level=2)),
len(g.groupby(level=3))))
np.savetxt(out, reshaped, delimiter=" ", newline="\n ", fmt='%14.7E')
out.write(sep)

def write_replica(rep, pdf_name, header, subgrids):
suffix = str(rep).zfill(4)
with open(pdf_name + "_" + suffix + ".dat", 'wb') as out:
_rep_to_buffer(out, header, subgrids)

def load_all_replicas(pdf, pdf_name):
rep0headers, rep0grids = load_replica_2(0,pdf_name)
headers, grids = zip(*[load_replica_2(rep, pdf_name, pdf.pdf[pdf.base[rep]], rep0grids) for rep in range(pdf.n)])
return [rep0headers] + list(headers), [rep0grids] + list(grids)

def big_matrix(gridlist):
central_value = gridlist[0]
X = pd.concat(gridlist[1:], axis=1,
keys=range(1,len(gridlist)+1), #avoid confusion with rep0
).subtract(central_value, axis=0)
if np.any(X.isnull()) or X.shape[0] != len(central_value):
raise ValueError("Incompatible grid specifications")
return X

def hessian_from_lincomb(pdf, V):

# preparing output folder
neig = V.shape[1]

base = lhapdf.paths()[0] + "/" + pdf.pdf_name + "/" + pdf.pdf_name
file = pdf.pdf_name + "_hessian_" + str(neig)
if not os.path.exists(file): os.makedirs(file)

# copy replica 0
shutil.copy(base + "_0000.dat", file + "/" + file + "_0000.dat")

inn = open(base + ".info", 'rb')
out = open(file + "/" + file + ".info", 'wb')
for l in inn.readlines():
if l.find("SetDesc:") >= 0: out.write("SetDesc: \"Hessian " + pdf.pdf_name + "_hessian\"\n")
elif l.find("NumMembers:") >= 0: out.write("NumMembers: " + str(neig+1) + "\n")
elif l.find("ErrorType: replicas") >= 0: out.write("ErrorType: symmhessian\n")
else: out.write(l)
inn.close()
out.close()

headers, grids = load_all_replicas(pdf, base)
hess_name = file + '/' + file
result = (big_matrix(grids).dot(V)).add(grids[0], axis=0, )
hess_header = b"PdfType: error\nFormat: lhagrid1\n"
for column in result.columns:
write_replica(column + 1, hess_name, hess_header, result[column])
Loading

0 comments on commit 36cc689

Please sign in to comment.