# Copyright 2019 TerraPower, LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Use the generic database class to compare two ARMI databases.

This assumes some intimate knowledge about how the database is structured internally.
For instance, it knows that the database is composed of HDF5 data (the attrs of a
dataset are used, and h5py Groups are indexed), and it knows how special data is
structured within the HDF5 dataset and what the corresponding attributes are used for.
Some of this could be easily pulled up to the public interfaces of the Database class,
which may allow for cross-version database checking, but there is probably little value
in doing so if one is able to convert between versions.

Speaking of conversions, some common issues may arise when comparing against databases
that were converted from an old version. Reading in the old database values can
sometimes lead to more parameters being written out to the new database than existed
in the original database (the extras are set to the parameter's default value). This
means that a converted database may contain parameters that the directly-produced
database does not; that is generally not a concern, as long as the extra converted
parameters hold their default values. Also, especially at the Component level, some
parameters are expected to differ. Specifically, the following:

* temperatures: The old database format simply did not store these on the component
  level, so when converting a database, the components in a block will uniformly get
  whatever the Block temperature was.
* serial numbers: At all levels, we cannot really expect the serial numbers to line
  up from object to object. These are not really supposed to be the same.
* volume: Component volumes are also not stored in the database; they are derived
  from temperatures.
* memory usage: Relatively self-evident. Resource usage will vary from run to run,
  even if the code hasn't changed.

"""
from typing import Sequence, Optional, Pattern, Tuple
import collections
import os
import re
import traceback

from tabulate import tabulate
import h5py
import numpy

from armi import runLog
from armi.bookkeeping.db import database3
from armi.bookkeeping.db.database3 import Database3
from armi.bookkeeping.db.factory import databaseFactory
from armi.bookkeeping.db.permissions import Permissions
from armi.reactor.composites import ArmiObject


class OutputWriter:
    """Basically a tee to writeln to runLog and the output file."""

    def __init__(self, fname):
        self.fname = fname
        self._stream = None

    def __enter__(self):
        self._stream = open(self.fname, "w")
        return self

    def __exit__(self, *args):
        self._stream.close()

    def writeln(self, msg: str) -> None:
        runLog.info(msg)
        self._stream.write(msg)
        self._stream.write("\n")
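

# A minimal usage sketch of OutputWriter (illustrative only; the file name is made
# up). Each writeln() call goes both to the ARMI run log and to the named file:
#
#     with OutputWriter("compare_summary.txt") as out:
#         out.writeln("starting comparison")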


class DiffResults:
    """
    Utility class for storing differences between database data.

    This class stores the differences between reference data and other ("source")
    data. It is configured with a tolerance, below which differences are ignored.
    Differences that exceed the tolerance are stored in a collection of differences,
    organized by time step, to be output later. It also keeps track of the number of
    issues that may have been encountered in attempting to compare two databases,
    such as missing datasets on one database or the other, or datasets with
    incompatible dimensions.

    All differences are based on an unusual type of relative difference, which uses
    the mean of the reference and source data elements as the normalization value:
    2*(C-E)/(C+E). This is somewhat strange, in that if the two are very different,
    the reported relative difference will be smaller than expected. It does have the
    useful property that if the reference value is zero and the source value is
    non-zero, the diff will not be infinite. We do not typically report these in any
    rigorous manner, so this should be fine, though we may wish to revisit this in
    the future.
    """

    def __init__(self, tolerance):
        self._columns = []
        self._structureDiffs = []
        self.tolerance = tolerance

        # diffs is a dictionary, keyed on strings describing the object to which the
        # diffs apply and the diff metric being used (e.g. mean(abs(diff)),
        # max(abs(diff))), with the values being a list of diffs by time step. If the
        # diff doesn't exceed the tolerance, a None is inserted instead.
        self.diffs = collections.defaultdict(self._getDefault)
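
    # A worked example of the 2*(C-E)/(C+E) normalization described in the class
    # docstring (illustrative only): with a reference value of 0.0 and a source value
    # of 2.0, the magnitude of the metric is 2*|2.0 - 0.0|/(2.0 + 0.0) = 2.0, which
    # stays finite, whereas an ordinary relative difference normalized by the
    # reference value would be infinite.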

    def addDiff(
        self, compType: str, paramName: str, absMean: float, mean: float, absMax: float
    ) -> None:
        """Add a collection of diffs to the diff dictionary if they exceed the tolerance."""
        absMean = absMean if absMean > self.tolerance else None
        self.diffs["{}/{} mean(abs(diff))".format(compType, paramName)].append(absMean)

        mean = mean if abs(mean) > self.tolerance else None
        self.diffs["{}/{} mean(diff)".format(compType, paramName)].append(mean)

        absMax = absMax if absMax > self.tolerance else None
        self.diffs["{}/{} max(abs(diff))".format(compType, paramName)].append(absMax)

    def addStructureDiffs(self, nDiffs: int) -> None:
        if not self._structureDiffs:
            self._structureDiffs = [0]

        self._structureDiffs[-1] += nDiffs

    def addTimeStep(self, tsName: str) -> None:
        self._structureDiffs.append(0)
        self._columns.append(tsName)

    def _getDefault(self) -> list:
        return [None] * (len(self._columns) - 1)

    def reportDiffs(self, stream: OutputWriter) -> None:
        """Print out a well-formatted table of the non-zero diffs."""
        # filter out empty rows
        diffsToPrint = {
            key: value
            for key, value in self.diffs.items()
            if not all(v is None for v in value)
        }
        stream.writeln(
            tabulate(
                [k.split() + val for k, val in sorted(diffsToPrint.items())],
                headers=self._columns,
            )
        )

    def nDiffs(self) -> int:
        """Return the number of differences that exceeded the tolerance."""
        return sum(
            1 for _, value in self.diffs.items() if any(v is not None for v in value)
        ) + sum(self._structureDiffs)
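

# How the comparison routines below drive DiffResults (an illustrative sketch; the
# time step, component, and parameter names, as well as the values, are made up):
#
#     diffResults = DiffResults(tolerance=1e-8)
#     diffResults.addTimeStep("/c00n00")
#     diffResults.addDiff("HexBlock", "power", absMean=1e-3, mean=-1e-3, absMax=2e-3)
#     diffResults.addStructureDiffs(0)
#     with OutputWriter("diffs.txt") as out:
#         diffResults.reportDiffs(out)
#
# Since all three metrics exceed the tolerance, nDiffs() would report 3 (one per
# metric), plus any structure differences.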


def compareDatabases(
    refFileName: str,
    srcFileName: str,
    exclusions: Optional[Sequence[str]] = None,
    tolerance: float = 0.0,
    timestepCompare: Optional[Sequence[Tuple[int, int]]] = None,
) -> Optional[DiffResults]:
    """High-level method to compare two ARMI H5 files, given file paths."""
    compiledExclusions = None
    if exclusions is not None:
        compiledExclusions = [re.compile(ex) for ex in exclusions]

    outputName = (
        os.path.basename(refFileName) + "_vs_" + os.path.basename(srcFileName) + ".txt"
    )

    diffResults = DiffResults(tolerance)
    with OutputWriter(outputName) as out:
        ref = databaseFactory(refFileName, Permissions.READ_ONLY_FME)
        src = databaseFactory(srcFileName, Permissions.READ_ONLY_FME)
        if not isinstance(ref, Database3) or not isinstance(src, Database3):
            raise TypeError(
                "This database comparer only knows how to deal with database version "
                "3; received {} and {}".format(type(ref), type(src))
            )

        with ref, src:
            if not timestepCompare:
                _, nDiff = _compareH5Groups(out, ref, src, "timesteps")
                if nDiff > 0:
                    runLog.warning(
                        "{} and {} have differing timestep groups, and are "
                        "probably not safe to compare. This is likely due to one of "
                        "the cases having failed to complete.".format(ref, src)
                    )
                    return None

            for refGroup, srcGroup in zip(
                ref.genTimeStepGroups(timeSteps=timestepCompare),
                src.genTimeStepGroups(timeSteps=timestepCompare),
            ):
                runLog.info(
                    f"Comparing ref time step {refGroup.name.split('/')[1]} to src "
                    f"time step {srcGroup.name.split('/')[1]}"
                )
                diffResults.addTimeStep(refGroup.name)
                _compareTimeStep(
                    out, refGroup, srcGroup, diffResults, exclusions=compiledExclusions
                )

            diffResults.reportDiffs(out)

    return diffResults
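

# An end-to-end usage sketch (illustrative only; the file names and the exclusion
# pattern are made up). Parameters that the module docstring says are expected to
# differ, such as serial numbers, can be filtered out with regular expressions; each
# pattern is matched against the full dataset path within a time step group:
#
#     results = compareDatabases(
#         "ref-case.h5",
#         "src-case.h5",
#         exclusions=[r".*/serialNum$"],
#         tolerance=1e-8,
#     )
#     if results is not None and results.nDiffs() > 0:
#         runLog.warning("The reference and source databases differ.")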


def _compareH5Groups(
    out: OutputWriter, ref: h5py.Group, src: h5py.Group, name: str
) -> Tuple[Sequence[str], int]:
    refGroups = set(ref.keys())
    srcGroups = set(src.keys())
    n = _compareSets(srcGroups, refGroups, out, name)
    return sorted(refGroups & srcGroups), n


def _compareTimeStep(
    out: OutputWriter,
    refGroup: h5py.Group,
    srcGroup: h5py.Group,
    diffResults: DiffResults,
    exclusions: Optional[Sequence[Pattern]] = None,
):
    groupNames, structDiffs = _compareH5Groups(
        out, refGroup, srcGroup, "composite objects/auxiliary data"
    )
    diffResults.addStructureDiffs(structDiffs)

    componentTypes = {gn for gn in groupNames if gn in ArmiObject.TYPES}
    auxData = set(groupNames) - componentTypes
    auxData.discard("layout")

    for componentType in componentTypes:
        refTypeGroup = refGroup[componentType]
        srcTypeGroup = srcGroup[componentType]
        _compareComponentData(
            out, refTypeGroup, srcTypeGroup, diffResults, exclusions=exclusions
        )

    for aux in auxData:
        _compareAuxData(out, refGroup[aux], srcGroup[aux], diffResults)


def _compareAuxData(
    out: OutputWriter,
    refGroup: h5py.Group,
    srcGroup: h5py.Group,
    diffResults: DiffResults,
):
    """
    Compare auxiliary datasets, which aren't stored as Parameters on the Composite model.

    Some parts of ARMI directly create HDF5 groups under the time step group to store
    arbitrary data. These still need to be compared. Missing datasets will be treated
    as structure differences and reported.
    """
    data = dict()

    def visitor(name, obj):
        if isinstance(obj, h5py.Dataset):
            data[name] = obj

    refGroup.visititems(visitor)
    refData = data

    data = dict()
    srcGroup.visititems(visitor)
    srcData = data

    n = _compareSets(
        set(srcData.keys()), set(refData.keys()), out, name="auxiliary dataset"
    )
    diffResults.addStructureDiffs(n)
    matchedSets = set(srcData.keys()) & set(refData.keys())

    for name in matchedSets:
        _diffSimpleData(refData[name], srcData[name], diffResults)


def _compareSets(
    src: set, ref: set, out: OutputWriter, name: Optional[str] = None
) -> int:
    nDiffs = 0
    printName = "" if name is None else name + " "

    if ref - src:
        nDiffs += len(ref - src)
        out.writeln("ref has {}not in src: {}".format(printName, list(ref - src)))

    if src - ref:
        nDiffs += len(src - ref)
        out.writeln("src has {}not in ref: {}".format(printName, list(src - ref)))

    return nDiffs


def _diffSpecialData(
    refData: h5py.Dataset,
    srcData: h5py.Dataset,
    out: OutputWriter,
    diffResults: DiffResults,
):
    """
    Compare specially-formatted datasets.

    This employs the pack/unpackSpecialData functions to reconstitute complicated
    datasets for comparison. These usually don't behave well as giant numpy arrays, so
    we go element-by-element to calculate the diffs, then concatenate them.
""" name = refData.name paramName = refData.name.split("/")[-1] compName = refData.name.split("/")[-2] nDiffs = _compareSets( set(srcData.attrs.keys()), set(refData.attrs.keys()), out, "formatting data" ) keysMatch = nDiffs == 0 diffResults.addStructureDiffs(nDiffs) if not keysMatch: diffResults.addDiff(name, name, numpy.inf, numpy.inf, numpy.inf) return if srcData.attrs.get("dict", False): # not bothering with dictionaries yet, though we will need to for things like # number densities return attrsMatch = True for k, srcAttr in srcData.attrs.items(): refAttr = refData.attrs[k] if isinstance(srcAttr, numpy.ndarray) and isinstance(refAttr, numpy.ndarray): srcFlat = srcAttr.flatten() refFlat = refAttr.flatten() if len(srcFlat) != len(refFlat): same = False else: same = all(srcFlat == refFlat) else: same = srcAttr == refAttr if not same: attrsMatch = False out.writeln( "Special formatting parameters for {} do not match for {}. Src: {} " "Ref: {}".format(name, k, srcData.attrs[k], refData.attrs[k]) ) break if not attrsMatch: return try: src = database3.unpackSpecialData(srcData[()], srcData.attrs, paramName) ref = database3.unpackSpecialData(refData[()], refData.attrs, paramName) except Exception: runLog.error( f"Unable to unpack special data for paramName {paramName}. " f"{traceback.format_exc()}", ) return diff = [] for dSrc, dRef in zip(src.tolist(), ref.tolist()): if isinstance(dSrc, numpy.ndarray) and isinstance(dRef, numpy.ndarray): if dSrc.shape != dRef.shape: out.writeln("Shapes did not match for {}".format(refData)) diffResults.addDiff( compName, paramName, numpy.inf, numpy.inf, numpy.inf ) return # make sure not to try to compare empty arrays. Numpy is mediocre at # these; they are super degenerate and cannot participate in concatenation. # Why? if 0 not in dSrc.shape: # Use the mean of the two to calc relative error. 
                # Use the mean of the two to calc relative error. This is more robust
                # to changes that cause one of the values to be zero, while the other
                # is non-zero, leading to infinite relative error
                dMean = (dSrc + dRef) / 2
                diff.append((dSrc - dRef) / dMean)
            continue

        if (dSrc is None) ^ (dRef is None):
            out.writeln("Mismatched Nones for {} in {}".format(paramName, compName))
            diff.append([numpy.inf])
            continue

        if dSrc is None:
            diff.append([0.0])
            continue

        try:
            # Use mean to avoid some infinities; see above
            dMean = (dSrc + dRef) / 2
            diff.append([(dSrc - dRef) / dMean])
        except ZeroDivisionError:
            if dSrc == dRef:
                diff.append([0.0])
            else:
                diff.append([numpy.inf])

    if diff:
        try:
            diff = [numpy.array(d).flatten() for d in diff]
            diff = numpy.concatenate(diff)
        except ValueError as e:
            out.writeln(
                "Failed to concatenate diff data for {} in {}: {}".format(
                    paramName, compName, diff
                )
            )
            out.writeln("Because: {}".format(e))
            return

        absDiff = numpy.abs(diff)
        mean = numpy.nanmean(diff)
        absMax = numpy.nanmax(absDiff)
        absMean = numpy.nanmean(absDiff)

        diffResults.addDiff(compName, paramName, absMean, mean, absMax)


def _diffSimpleData(ref: h5py.Dataset, src: h5py.Dataset, diffResults: DiffResults):
    paramName = ref.name.split("/")[-1]
    compName = ref.name.split("/")[-2]

    try:
        # use mean to avoid some unnecessary infinities
        mean = (src[()] + ref[()]) / 2.0
        diff = (src[()] - ref[()]) / mean
    except TypeError:
        # Strings are persnickety
        if src.dtype.kind == ref.dtype.kind and src.dtype.kind in {"U", "S"}:
            return
        else:
            runLog.error("Failed to compare {} in {}".format(paramName, compName))
            runLog.error("source: {}".format(src))
            runLog.error("reference: {}".format(ref))
            diff = numpy.array([numpy.inf])
    except ValueError:
        runLog.error("Failed to compare {} in {}".format(paramName, compName))
        runLog.error("source: {}".format(src))
        runLog.error("reference: {}".format(ref))
        diff = numpy.array([numpy.inf])

    if 0 in diff.shape:
        # Empty list, no diff
        return

    absDiff = numpy.abs(diff)
    mean = numpy.nanmean(diff)
    absMax = numpy.nanmax(absDiff)
    absMean = numpy.nanmean(absDiff)

    diffResults.addDiff(compName, paramName, absMean, mean, absMax)


def _compareComponentData(
    out: OutputWriter,
    refGroup: h5py.Group,
    srcGroup: h5py.Group,
    diffResults: DiffResults,
    exclusions: Optional[Sequence[Pattern]] = None,
):
    exclusions = exclusions or []
    compName = refGroup.name
    paramNames, nDiff = _compareH5Groups(
        out, refGroup, srcGroup, "{} parameters".format(compName)
    )
    diffResults.addStructureDiffs(nDiff)

    for paramName in paramNames:
        fullName = "/".join((refGroup.name, paramName))
        if any(pattern.match(fullName) for pattern in exclusions):
            runLog.debug(
                "Skipping comparison of {} since it is being ignored.".format(fullName)
            )
            continue

        refDataset = refGroup[paramName]
        srcDataset = srcGroup[paramName]
        srcSpecial = srcDataset.attrs.get("specialFormatting", False)
        refSpecial = refDataset.attrs.get("specialFormatting", False)

        if srcSpecial ^ refSpecial:
            out.writeln(
                "Could not compare data for parameter {} because one uses special "
                "formatting, and the other does not. Ref: {} Src: {}".format(
                    paramName, refSpecial, srcSpecial
                )
            )
            diffResults.addDiff(
                refGroup.name, paramName, numpy.inf, numpy.inf, numpy.inf
            )
            continue

        if srcSpecial or refSpecial:
            _diffSpecialData(refDataset, srcDataset, out, diffResults)
        else:
            _diffSimpleData(refDataset, srcDataset, diffResults)
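

# A small sketch of how one might check, with plain h5py, which comparison path a
# given dataset will take (illustrative only; the file name and dataset path are made
# up, and the actual group layout depends on the database being inspected):
#
#     with h5py.File("ref-case.h5", "r") as h5:
#         dataset = h5["c00n00/HexBlock/temperatureInC"]
#         if dataset.attrs.get("specialFormatting", False):
#             print("compared via _diffSpecialData")
#         else:
#             print("compared via _diffSimpleData")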