Merging branches

Martin Opat 2024-12-28 11:36:23 +01:00
commit f0c6141f2c
24 changed files with 1149 additions and 563 deletions

5
.gitignore vendored
View File

@@ -2,10 +2,11 @@
.vscode
# Build
build
build/
build/*
*.o
# Data
data/
# Resulting images
*.ppm

6
load-modules.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Load the needed modules in Habrok
ml CUDA
ml netCDF-C++4

View File

@@ -1,6 +1,7 @@
 # Compiler and flags
 NVCC = nvcc
-CXXFLAGS = -I./src -I./linalg -I./img -I./objs -std=c++17
+CXXFLAGS = -I./src -I./hurricanedata -std=c++17 $(shell nc-config --cxx4flags) $(shell nc-config --cxx4libs) -g -G
+COMPILE_OBJ_FLAGS = --device-c

 # Directories
 SRC_DIR = src
@@ -8,7 +9,7 @@ BUILD_DIR = build
 # Files
 TARGET = $(BUILD_DIR)/main
-SRC_FILES = $(wildcard $(SRC_DIR)/*.cu)
+SRC_FILES := $(shell find $(SRC_DIR) -type f \( -name '*.cu' -o -name '*.cpp' \))
 OBJ_FILES = $(patsubst $(SRC_DIR)/%.cu,$(BUILD_DIR)/%.o,$(SRC_FILES))

@@ -16,11 +17,12 @@ all: $(TARGET)
 # Build the main target
 $(TARGET): $(OBJ_FILES) | $(BUILD_DIR)
-	$(NVCC) $(CXXFLAGS) -o $@ $^
+	$(NVCC) $(CXXFLAGS) $^ -o $@

 # Compile object files
 $(BUILD_DIR)/%.o: $(SRC_DIR)/%.cu
-	$(NVCC) $(CXXFLAGS) -c $< -o $@
+	@mkdir -p $(dir $@)
+	$(NVCC) $(CXXFLAGS) $(COMPILE_OBJ_FLAGS) -c $< -o $@

 # Debug build
 debug: CXXFLAGS += -g

View File

@@ -0,0 +1,77 @@
#include "datareader.h"
#include <netcdf>
#include <cassert>
using namespace std;
using namespace netCDF;
DataReader::DataReader(const std::string &path, std::string variableName):
filePathManager(path), variableName(variableName) { }
size_t DataReader::fileLength(size_t fileIndex) {
NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
multimap<string, NcVar> vars = data.getVars();
NcVar var = vars.find(variableName)->second;
size_t length = 1;
for (NcDim dim: var.getDims()) {
length *= dim.getSize();
}
return length;
}
size_t DataReader::axisLength(size_t fileIndex, const std::string& axisName) {
NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
multimap<string, NcVar> vars = data.getVars();
NcVar var = vars.find(axisName)->second;
assert(var.getDimCount() == 1);
netCDF::NcDim dim = var.getDim(0);
return dim.getSize();
}
template <typename T>
void DataReader::loadFile(T* dataOut, size_t fileIndex) {
loadFile(dataOut, fileIndex, variableName);
}
template <typename T>
void DataReader::loadFile(T* dataOut, size_t fileIndex, const string& varName) {
// TODO: We could make the index wrap around so that we don't lose data
// using fileIndex % NUMBEROFFILES here.
NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
multimap<string, NcVar> vars = data.getVars();
NcVar var = vars.find(varName)->second;
var.getVar(dataOut);
}
template void DataReader::loadFile<float>(float* dataOut, size_t fileIndex, const string& variableName);
template void DataReader::loadFile<int>(int* dataOut, size_t fileIndex, const string& variableName);
template void DataReader::loadFile<double>(double* dataOut, size_t fileIndex, const string& variableName);
template void DataReader::loadFile<double>(double* dataOut, size_t fileIndex);
template void DataReader::loadFile<float>(float* dataOut, size_t fileIndex);
template void DataReader::loadFile<int>(int* dataOut, size_t fileIndex);
DataReader::~DataReader() {
}
// template <typename T>
// void DataReader::readAndAllocateAxis(T** axis_ptr, size_t *size, const string &varName) {
// assert(var.getDimCount() == 1);
// netCDF::NcDim dim = var.getDim(0);
// *size = dim.getSize();
// cudaError_t status = cudaMallocManaged(axis_ptr, *size*sizeof(T));
// var.getVar(*axis_ptr);
// }

View File

@@ -0,0 +1,56 @@
#ifndef DATAREADER_H
#define DATAREADER_H
#include <string>
#include <experimental/propagate_const>
#include <memory>
#include "filepathmanager.h"
/**
* @brief Simple wrapper around all the NetCDF functionality.
*/
class DataReader {
public:
/**
* @brief Constructor.
* @param path Path to the directory containing all the .nc4 files.
* @param variableName The variable we are interested in.
*/
DataReader(const std::string &path, std::string variableName);
/**
* @brief The length of the flat data in variableName.
* Used for allocating the right amount of memory.
*/
size_t fileLength(size_t fileIndex);
/**
* @brief Write all the data in file fileIndex of the variable we're interested in into dataOut.
*/
template <typename T>
void loadFile(T* dataOut, size_t fileIndex);
/**
* @brief Write all the data in file fileIndex of variable variableName into dataOut.
* @param dataOut pointer to memory that should be written to.
* @param fileIndex the index of the file we want to load into memory.
* @param variableName the name of the variable
*/
template <typename T>
void loadFile(T* dataOut, size_t fileIndex, const std::string& variableName);
/**
* @brief Get size of a variable.
*/
size_t axisLength(size_t fileIndex, const std::string& axisName);
~DataReader();
private:
FilePathManager filePathManager;
std::string variableName;
};
#endif //DATAREADER_H
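A minimal usage sketch of this interface (not part of the commit; the directory path and variable name are the ones used in main.cu, and error handling is omitted):

#include "hurricanedata/datareader.h"
#include <vector>

int main() {
    // Read the temperature variable "T" from the first file in the data directory.
    DataReader reader{"data/atmosphere_MERRA-wind-speed[179253532]", "T"};
    size_t n = reader.fileLength(0);                 // total number of values of "T" in file 0
    std::vector<float> values(n);
    reader.loadFile<float>(values.data(), 0);        // fill the buffer with file 0's "T" data
    size_t numTimes = reader.axisLength(0, "time");  // length of the "time" axis
    (void)numTimes;
    return 0;
}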

View File

@@ -0,0 +1,21 @@
#include "fielddata.h"
__device__ float getVal(
const FieldMetadata &md,
const FieldData &d,
const size_t &timeInd,
const size_t &levInd,
const size_t &latInd,
const size_t &lonInd
) {
size_t valArrInd = (d.fieldInd + timeInd) / md.numberOfTimeStepsPerFile;
size_t trueTimeInd = (d.fieldInd + timeInd) % md.numberOfTimeStepsPerFile;
size_t sizeSpatialData = md.widthSize*md.heightSize*md.depthSize;
size_t size2DMapData = md.widthSize*md.heightSize;
return d.valArrays[valArrInd][
trueTimeInd*sizeSpatialData
+ levInd*size2DMapData
+ latInd*md.widthSize
+ lonInd
];
}
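The indexing above amounts to a row-major (time, lev, lat, lon) layout per file. As a host-side illustration of the same arithmetic (flatIndex is a name introduced here, not part of the commit):

#include "hurricanedata/fielddata.h"

// Flat offset of (trueTimeInd, levInd, latInd, lonInd) inside one file's value array.
size_t flatIndex(const FieldMetadata &md, size_t trueTimeInd,
                 size_t levInd, size_t latInd, size_t lonInd) {
    const size_t sizeSpatialData = md.widthSize * md.heightSize * md.depthSize;
    const size_t size2DMapData = md.widthSize * md.heightSize;
    return trueTimeInd * sizeSpatialData
         + levInd * size2DMapData
         + latInd * md.widthSize
         + lonInd;
}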

View File

@@ -0,0 +1,58 @@
#ifndef FIELDDATA_H
#define FIELDDATA_H
#include <vector>
struct FieldMetadata {
size_t widthSize; // Number of different longitudes
size_t heightSize; // Number of different latitudes
size_t depthSize; // Number of different levels
size_t timeSize; // Number of different times
// lons is a managed Unified Memory array of size widthSize such that the value
// returned by getVal with longitude index i has longitude lons[i].
// The other axis arrays (lats, levs, times) are defined analogously.
double *lons;
double *lats;
double *levs;
int *times;
size_t numberOfTimeStepsPerFile;
};
using FieldMetadata = FieldMetadata;
/**
* @brief Allows accessing the data of a time-slice of a scalar field
* that may start in the middle of a file or span multiple files,
* by holding references to multiple files at a time.
*
* @note Use the getVal method to index into it and get values.
*/
struct FieldData {
static constexpr size_t FILESNUM = 2; // Number of files stored in a FieldData struct.
static constexpr size_t numberOfTimeStepsPerField = 2;
size_t fieldInd; // Time-step index within the first buffered file at which this time-slice starts.
// A uniform array of length FILESNUM storing pointers to 4D arrays stored in device memory.
float **valArrays;
};
/**
* @brief Get the scalar field value at a particular index (timeInd, levInd, latInd, lonInd).
* Note that the timeInd may be counter-intuitive. See explanation in gpubufferhandler.h.
*
* @return scalar field float value.
*/
extern __device__ float getVal(
const FieldMetadata &md,
const FieldData &d,
const size_t &timeInd,
const size_t &levInd,
const size_t &latInd,
const size_t &lonInd
);
#endif //FIELDDATA_H

View File

@@ -0,0 +1,40 @@
#include <iostream>
#include <vector>
#include <string>
#include <filesystem>
#include <algorithm>
#include "filepathmanager.h"
namespace fs = std::filesystem;
FilePathManager::FilePathManager(const std::string& path) {
// Check if the directory exists
if (!fs::exists(path) || !fs::is_directory(path)) {
std::cerr << "Error: Directory does not exist or is not valid." << std::endl;
exit(1);
}
// Iterate over directory entries
for (const auto& entry : fs::directory_iterator(path)) {
if (entry.is_regular_file() && entry.path().extension() == ".nc4") {
fileNames.push_back(entry.path().string());
}
}
// Sort file names lexicographically
std::sort(fileNames.begin(), fileNames.end());
// Print sorted file names
std::cout << "Files in directory '" << path << "' sorted lexicographically:\n";
for (const auto& fileName : fileNames) {
std::cout << fileName << std::endl;
}
}
const char* FilePathManager::getPath(size_t index) const {
return fileNames.at(index).c_str();
}
FilePathManager::~FilePathManager() { }

View File

@@ -0,0 +1,28 @@
#ifndef FILEPATHMANAGER_H
#define FILEPATHMANAGER_H
#include <string>
#include <vector>
/**
* @brief Simple class that is responsible for mapping a file index to a file path.
* So for example:
* index = 0 -> MERRA2_400.inst6_3d_ana_Np.20120101.nc4
* index = 1 -> MERRA2_400.inst6_3d_ana_Np.20120102.nc4
* index = 2 -> MERRA2_400.inst6_3d_ana_Np.20120103.nc4
* index = 3 -> MERRA2_400.inst6_3d_ana_Np.20120104.nc4
* etc...
*/
class FilePathManager {
public:
FilePathManager(const std::string& path);
size_t getNumberOfFiles();
const char* getPath(size_t index) const;
~FilePathManager();
private:
std::vector<std::string> fileNames;
};
#endif //FILEPATHMANAGER_H
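Illustrative usage (hypothetical snippet, not part of the commit; the directory path matches the one used in main.cu):

#include "hurricanedata/filepathmanager.h"
#include <iostream>

int main() {
    FilePathManager paths("data/atmosphere_MERRA-wind-speed[179253532]");
    // Index 0 maps to the lexicographically first .nc4 file in the directory.
    std::cout << paths.getPath(0) << "\n";
    return 0;
}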

View File

@@ -0,0 +1,277 @@
#include "gpubuffer.h"
#include <mutex>
#include <thread>
#include <queue>
#include <cassert>
#include <condition_variable>
#include <iostream>
#include <future>
#include <variant>
using namespace std;
struct File {
condition_variable cv;
mutex m;
bool valid;
size_t size;
float *h_data; // host data
float *d_data; // device data
};
struct LoadFileJob {
size_t fileIndex;
size_t bufferIndex;
};
struct GetSizeJob {
size_t fileIndex;
promise<size_t> size;
};
struct GetAxisDoubleJob {
size_t fileIndex;
string axisName;
promise<pair<size_t, double *>> axis;
};
struct GetAxisIntJob {
size_t fileIndex;
string axisName;
promise<pair<size_t, int *>> axis;
};
using Job = variant<LoadFileJob, GetSizeJob, GetAxisIntJob, GetAxisDoubleJob>;
class GPUBuffer::impl {
public:
impl(DataReader& dataReader);
void loadFile(LoadFileJob job);
void loadFile(size_t fileIndex, size_t bufferIndex);
void getSize(GetSizeJob job);
size_t getSize(size_t fileIndex); // Most probably blocking
void getAxis(GetAxisDoubleJob job);
void getAxis(GetAxisIntJob job);
template <typename T>
std::pair<size_t, T *> getAxis(size_t fileIndex, const std::string& axisName); // Most probably blocking
~impl();
File buffer[numBufferedFiles];
// Thread worker things
void worker();
queue<Job> jobs;
unique_ptr<thread> ioworker;
cudaStream_t iostream;
DataReader& dataReader;
condition_variable queuecv;
mutex queuecv_m;
bool workerRunning = true;
};
GPUBuffer::GPUBuffer(DataReader& dataReader): pImpl(make_unique<impl>(dataReader)) { }
size_t GPUBuffer::impl::getSize(size_t fileIndex) {
promise<size_t> promise;
future<size_t> future = promise.get_future();
{
lock_guard<mutex> lk(queuecv_m);
jobs.push(GetSizeJob{fileIndex, move(promise)});
}
queuecv.notify_all();
future.wait();
return future.get();
}
template <typename T>
pair<size_t, T *> GPUBuffer::getAxis(size_t fileIndex, const string& axisName) {
return pImpl->getAxis<T>(fileIndex, axisName);
}
template pair<size_t, int *> GPUBuffer::getAxis<int>(size_t fileIndex, const string& axisName);
template pair<size_t, double *> GPUBuffer::getAxis<double>(size_t fileIndex, const string& axisName);
template <>
pair<size_t, double *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
promise<pair<size_t, double *>> promise;
future<pair<size_t, double *>> future = promise.get_future();
{
lock_guard<mutex> lk(queuecv_m);
jobs.push(GetAxisDoubleJob{fileIndex, axisName, move(promise)});
}
queuecv.notify_all();
future.wait();
return future.get();
}
template pair<size_t, double *> GPUBuffer::impl::getAxis<double>(size_t fileIndex, const string& axisName);
template <>
pair<size_t, int *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
promise<pair<size_t, int *>> promise;
future<pair<size_t, int *>> future = promise.get_future();
{
lock_guard<mutex> lk(queuecv_m);
jobs.push(GetAxisIntJob{fileIndex, axisName, move(promise)});
}
queuecv.notify_all();
future.wait();
return future.get();
}
template pair<size_t, int *> GPUBuffer::impl::getAxis<int>(size_t fileIndex, const string& axisName);
GPUBuffer::impl::impl(DataReader& dataReader): dataReader(dataReader) {
cudaStreamCreate(&iostream);
ioworker = make_unique<thread>([this]() { worker(); });
size_t size = getSize(0);
auto x = getAxis<int>(0, "time");
size_t sizeTime = x.first;
cudaFree(x.second);
for (size_t i = 0; i < numBufferedFiles; i++) {
{
File &file = buffer[i];
lock_guard<mutex> lk(file.m);
cudaMallocHost(&file.h_data, sizeof(float)*size);
cudaMalloc(&file.d_data, sizeof(float)*size);
file.size = size;
file.valid = false;
}
// loadFile(i, i);
}
}
GPUBuffer::~GPUBuffer() {
}
GPUBuffer::impl::~impl() {
{
lock_guard<mutex> lk(queuecv_m);
workerRunning = false;
}
queuecv.notify_all();
ioworker->join();
for (size_t i = 0; i < numBufferedFiles; i++) {
File &file = buffer[i];
cudaFree(file.d_data);
cudaFreeHost(file.h_data);
}
cudaStreamDestroy(iostream);
}
void GPUBuffer::impl::loadFile(LoadFileJob job) {
File &file = buffer[job.bufferIndex];
{
lock_guard<mutex> lk(file.m);
assert(!file.valid);
cout << "loading file with index " << job.fileIndex << "\n";
dataReader.loadFile<float>(file.h_data, job.fileIndex);
cudaMemcpyAsync(file.d_data, file.h_data, sizeof(float)*file.size, cudaMemcpyHostToDevice, iostream);
cudaStreamSynchronize(iostream);
buffer[job.bufferIndex].valid = true;
}
file.cv.notify_all();
}
void GPUBuffer::impl::getSize(GetSizeJob job) {
size_t size = dataReader.fileLength(job.fileIndex);
job.size.set_value(size);
}
void GPUBuffer::impl::getAxis(GetAxisDoubleJob job) {
pair<size_t, double *> array;
array.first = dataReader.axisLength(job.fileIndex, job.axisName);
cudaError_t status = cudaMallocManaged(&array.second, array.first*sizeof(double));
dataReader.loadFile<double>(array.second, job.fileIndex, job.axisName);
job.axis.set_value(array);
}
void GPUBuffer::impl::getAxis(GetAxisIntJob job) {
pair<size_t, int *> array;
array.first = dataReader.axisLength(job.fileIndex, job.axisName);
cudaError_t status = cudaMallocManaged(&array.second, array.first*sizeof(int));
dataReader.loadFile<int>(array.second, job.fileIndex, job.axisName);
job.axis.set_value(array);
}
void GPUBuffer::loadFile(size_t fileIndex, size_t bufferIndex) {
pImpl->loadFile(fileIndex, bufferIndex);
}
void GPUBuffer::impl::loadFile(size_t fileIndex, size_t bufferIndex) {
LoadFileJob job = {
.fileIndex = fileIndex,
.bufferIndex = bufferIndex
};
// Main thread theoretically blocks on 2 mutexes here
// but it _should_ never have to wait for them.
{
File &file = buffer[bufferIndex];
std::unique_lock<std::mutex> lk(file.m, std::defer_lock);
bool lockval = lk.try_lock();
if (!lockval) {
cout << "waited on GPUBuffer during loadFile orchestration :(\n";
lk.lock();
}
file.valid = false;
}
{
std::unique_lock<std::mutex> lk(queuecv_m, std::defer_lock);
bool lockval = lk.try_lock();
if (!lockval) {
cout << "waited on IOworker queue during loadFile orchestration :(\n";
lk.lock();
}
jobs.push(job);
}
queuecv.notify_all();
}
void GPUBuffer::impl::worker() {
while(workerRunning) {
Job job;
{
unique_lock<mutex> lk(queuecv_m);
queuecv.wait(lk, [this]{ return !workerRunning || !jobs.empty(); });
if (!workerRunning) {
return;
}
job = move(jobs.front());
jobs.pop();
}
if(holds_alternative<LoadFileJob>(job)) {
loadFile(get<LoadFileJob>(job));
} else if(holds_alternative<GetSizeJob>(job)) {
getSize(move(get<GetSizeJob>(job)));
} else if(holds_alternative<GetAxisDoubleJob>(job)) {
getAxis(move(get<GetAxisDoubleJob>(job)));
} else if(holds_alternative<GetAxisIntJob>(job)) {
getAxis(move(get<GetAxisIntJob>(job)));
}
}
}
DataHandle GPUBuffer::getDataHandle(size_t bufferIndex) {
File &file = pImpl->buffer[bufferIndex];
// TODO: Might be nice to measure the blocking time here.
unique_lock<mutex> lk(file.m);
file.cv.wait(lk, [this, bufferIndex]{ return pImpl->buffer[bufferIndex].valid; });
DataHandle dh = {
.d_data = file.d_data,
.size = file.size
};
return dh;
}

View File

@@ -0,0 +1,60 @@
#ifndef GPUBUFFER_H
#define GPUBUFFER_H
#include <string>
#include <memory>
#include <experimental/propagate_const>
#include "datareader.h"
struct DataHandle {
float *d_data; // Device memory
size_t size;
// Could include the data stored in host memory h_data in this handle if it were needed.
};
/**
* @brief Handles the asynchronous (un)loading of data to the GPU. The rest of the
* application should not have to interface directly with this class. Getting data
* should go over the GPUBufferHandler class.
*
* @note This class uses a queue to give loading jobs to the worker thread.
* NetCDF-C is not thread safe so you may never read data using netCDF directly
* on any other thread than this worker thread.
*
* Assumes all the data in the nc4 files is the same size.
*
*/
class GPUBuffer {
public:
static constexpr size_t numBufferedFiles = 3; // Number of files stored in memory at one time.
GPUBuffer(DataReader& dataReader);
/**
* @brief Asynchronously tells the GPUBuffer to eventually load a particular file index
* into a buffer index (in which part of the buffer the file should be stored).
*/
void loadFile(size_t fileIndex, size_t bufferIndex); // No blocking
/**
* @brief Get the values stored in a particular buffer index
* @return A struct DataHandle that points to the memory and gives its size.
*/
DataHandle getDataHandle(size_t bufferIndex); // Potentially blocking
/**
* @brief Get the data from an axis (e.g. longitude) and its size.
* @note This is a blocking operation, so it makes a job for the worker
* to read the data and then waits until the job is completed.
*/
template <typename T>
std::pair<size_t, T *> getAxis(size_t fileIndex, const std::string& axisName); // Most probably blocking
~GPUBuffer();
private:
class impl;
std::experimental::propagate_const<std::unique_ptr<impl>> pImpl;
};
#endif //GPUBUFFER_H
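A sketch of how the pieces fit together (hypothetical driver, modeled on GPUBufferHandler's constructor and on main.cu; in normal use GPUBufferHandler issues these calls for you):

#include <cuda_runtime.h>
#include "hurricanedata/datareader.h"
#include "hurricanedata/gpubuffer.h"

int main() {
    DataReader reader{"data/atmosphere_MERRA-wind-speed[179253532]", "T"};
    GPUBuffer buffer(reader);

    buffer.loadFile(/*fileIndex=*/0, /*bufferIndex=*/0);     // queued; the worker thread does the I/O
    DataHandle handle = buffer.getDataHandle(0);             // blocks until the host-to-device copy is done

    auto [numLons, lons] = buffer.getAxis<double>(0, "lon"); // managed memory holding the longitude axis
    cudaFree(lons);                                          // axis arrays are released by the caller
    (void)handle; (void)numLons;
    return 0;
}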

View File

@@ -0,0 +1,93 @@
#include "gpubufferhandler.h"
#include "fielddata.h"
#include <iostream>
using namespace std;
GPUBufferHandler::GPUBufferHandler(GPUBuffer& gpuBuffer):
gpuBuffer(gpuBuffer), fieldInd(0), bufferInd(0), fileInd(0) {
cudaMallocManaged(&fmd, sizeof(FieldMetadata));
auto [widthSize, lons] = gpuBuffer.getAxis<double>(0, "lon");
fmd->widthSize = widthSize;
fmd->lons = lons;
auto [heightSize, lats] = gpuBuffer.getAxis<double>(0, "lat");
fmd->heightSize = heightSize;
fmd->lats = lats;
auto [depthSize, levs] = gpuBuffer.getAxis<double>(0, "lev");
fmd->depthSize = depthSize;
fmd->levs = levs;
for (size_t i = 0; i < GPUBuffer::numBufferedFiles; i++) {
gpuBuffer.loadFile(fileInd,fileInd);
fileInd++;
}
fmd->timeSize = FieldData::numberOfTimeStepsPerField;
// getAxis already returns a cudaMallocManaged array, so fmd->times needs no separate allocation.
auto [numberOfTimeStepsPerFile, times] = gpuBuffer.getAxis<int>(0, "time");
fmd->numberOfTimeStepsPerFile = numberOfTimeStepsPerFile;
fmd->times = times;
cudaMallocManaged(&valArrays, sizeof(float *)*FieldData::FILESNUM);
}
FieldData GPUBufferHandler::setupField(size_t newEndBufferInd) {
FieldData fd;
size_t fieldDataInd = 0;
fd.valArrays = valArrays;
cout << "getting field from files " << bufferInd << " to " << newEndBufferInd << " with a field index of " << fieldInd << "\n";
for (int i = bufferInd; i <= newEndBufferInd; i++) {
DataHandle x = gpuBuffer.getDataHandle(i);
fd.valArrays[fieldDataInd] = x.d_data;
fieldDataInd++;
}
fd.fieldInd = fieldInd;
return fd;
}
FieldData GPUBufferHandler::nextFieldData() {
DataHandle x = gpuBuffer.getDataHandle(bufferInd);
size_t newFieldInd = (fieldInd + 1) % fmd->numberOfTimeStepsPerFile;
size_t newBufferInd = (bufferInd + ((fieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
size_t endFieldInd = (fieldInd + FieldData::numberOfTimeStepsPerField - 1) % fmd->numberOfTimeStepsPerFile;
size_t endBufferInd = (bufferInd + (fieldInd + FieldData::numberOfTimeStepsPerField - 1)/fmd->numberOfTimeStepsPerFile) % GPUBuffer::numBufferedFiles;
size_t newEndFieldInd = (endFieldInd + 1) % fmd->numberOfTimeStepsPerFile;
size_t newEndBufferInd = (endBufferInd + ((endFieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
if(firstTimeStep) {
firstTimeStep = false;
return setupField(endBufferInd);
}
fieldInd = newFieldInd;
if (newBufferInd != bufferInd) {
gpuBuffer.loadFile(fileInd, bufferInd);
fileInd++;
bufferInd = newBufferInd;
}
if (newEndBufferInd != endBufferInd) {
// maybe don't do anything here?
}
return setupField(newEndBufferInd);
}
GPUBufferHandler::~GPUBufferHandler() {
cudaFree(fmd->lons);
cudaFree(fmd->lats);
cudaFree(fmd->levs);
cudaFree(fmd->times);
cudaFree(valArrays);
cudaFree(fmd);
}

View File

@@ -0,0 +1,64 @@
#ifndef GPUBUFFERHANDLER_H
#define GPUBUFFERHANDLER_H
#include "fielddata.h"
#include "gpubuffer.h"
#include <string>
/**
* @brief Responsible for deciding when the GPUBuffer should load and unload files.
* Also assembles and gives access to FieldData.
*
* You will need to interface with this class.
*/
class GPUBufferHandler {
public:
GPUBufferHandler(GPUBuffer& gpuBuffer);
/**
* @brief Produces a FieldData that can be used to retrieve values for a time-slice
* of the scalar field that is FieldData::numberOfTimeStepsPerField time steps wide.
*
* @details This method always increments the start point of the time-slice
* by 1. See below:
*
* time steps = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 ...
* files (4 time steps per file) = [0 1 2 3][4 5 6 7][8 9 10 11][12 13 ...
* nextFieldData() (1st call) = [0 1]
* nextFieldData() (2nd call) = [1 2]
* nextFieldData() (3rd call) = [2 3]
* nextFieldData() (4th call) = [3 4]
* nextFieldData() (5th call) = [4 5]
* etc...
*
* When getting values from a FieldData using the getVal method,
* the time index is relative to the start of the time-slice.
*
* This means that for d = nextFieldData() (4th call),
* getVal(.., fieldData=d, ..., timeInd = 1, ...) gives a value at
* absolute time step 5 as seen above.
*/
FieldData nextFieldData();
~GPUBufferHandler();
/**
* You can get the FieldMetadata from here.
*/
FieldMetadata *fmd;
static void freeFieldData();
private:
FieldData setupField(size_t endBufferInd);
GPUBuffer& gpuBuffer;
size_t fileInd;
size_t bufferInd;
size_t fieldInd;
bool firstTimeStep = true;
float **valArrays;
};
#endif //GPUBUFFERHANDLER_H
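For reference, the intended call pattern (mirroring the loop in main.cu; drive and the index names below are illustrative, not part of the commit):

#include "hurricanedata/gpubuffer.h"
#include "hurricanedata/gpubufferhandler.h"

void drive(GPUBuffer &buffer) {
    GPUBufferHandler handler(buffer);
    for (int i = 0; i < 10; i++) {
        FieldData fd = handler.nextFieldData();
        // Inside a kernel, getVal(*handler.fmd, fd, 0, lev, lat, lon) is absolute time step i,
        // and getVal(*handler.fmd, fd, 1, lev, lat, lon) is absolute time step i + 1.
    }
}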

View File

@@ -1,224 +1,276 @@
// #include <iostream>
// #include <fstream>
// #include <cmath>
// #include <cuda_runtime.h>
// #include "linalg/linalg.h"
// #include "objs/sphere.h"
// #include "img/handler.h"
// // TODO: Eventually, export this into a better place (i.e., such that we do not have to recompile every time we change a parameter)
// static const int VOLUME_WIDTH = 1024;
// static const int VOLUME_HEIGHT = 1024;
// static const int VOLUME_DEPTH = 1024;
// static const int IMAGE_WIDTH = 2560;
// static const int IMAGE_HEIGHT = 1440;
// static const int SAMPLES_PER_PIXEL = 8; // TODO: Right now uses simple variance, consider using something more advanced (e.g., some commonly-used noise map)
// __constant__ int d_volumeWidth;
// __constant__ int d_volumeHeight;
// __constant__ int d_volumeDepth;
// static float* d_volume = nullptr; // TODO: Adjust according to how data is loaded
// // ----------------------------------------------------------------------------------------------------
// __device__ Vec3 phongShading(const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& baseColor) {
// double ambientStrength = 0.3;
// double diffuseStrength = 0.8;
// double specularStrength = 0.5;
// int shininess = 32;
// Vec3 ambient = baseColor * ambientStrength;
// double diff = fmax(normal.dot(lightDir), 0.0);
// Vec3 diffuse = baseColor * (diffuseStrength * diff);
// Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
// double spec = pow(fmax(viewDir.dot(reflectDir), 0.0), shininess);
// Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
// return ambient + diffuse + specular;
// }
// // Raycast + phong
// __global__ void raycastKernel(float* volumeData, unsigned char* framebuffer, int imageWidth, int imageHeight, Vec3 cameraPos, Vec3 cameraDir, Vec3 cameraUp, float fov, float stepSize, Vec3 lightPos) {
// int px = blockIdx.x * blockDim.x + threadIdx.x;
// int py = blockIdx.y * blockDim.y + threadIdx.y;
// if (px >= imageWidth || py >= imageHeight) return;
// float accumR = 0.0f;
// float accumG = 0.0f;
// float accumB = 0.0f;
// // Multiple samples per pixel
// for (int s = 0; s < SAMPLES_PER_PIXEL; s++) {
// // Map to [-1, 1]
// float u = ((px + 0.5f) / imageWidth ) * 2.0f - 1.0f;
// float v = ((py + 0.5f) / imageHeight) * 2.0f - 1.0f;
// // TODO: Move this (and all similar transformation code) to its own separate file
// float tanHalfFov = tanf(fov * 0.5f);
// u *= tanHalfFov;
// v *= tanHalfFov;
// // Find ray direction
// Vec3 cameraRight = (cameraDir.cross(cameraUp)).normalize();
// cameraUp = (cameraRight.cross(cameraDir)).normalize();
// Vec3 rayDir = (cameraDir + cameraRight*u + cameraUp*v).normalize();
// // Intersect (for simplicity just a 3D box from 0 to 1 in all dimensions) - TODO: Think about whether this is the best way to do this
// float tNear = 0.f; // TODO: These are also linear transforms, so move away
// float tFar = 1e6f;
// auto intersectAxis = [&](float start, float dirVal) {
// if (fabsf(dirVal) < 1e-10f) { // TODO: Add a constant - epsilon
// if (start < 0.f || start > 1.f) {
// tNear = 1e9f;
// tFar = -1e9f;
// }
// } else {
// float t0 = (0.0f - start) / dirVal; // TODO: 0.0 and 1.0 depend on the box size -> move to a constant
// float t1 = (1.0f - start) / dirVal;
// if (t0>t1) {
// float tmp=t0;
// t0=t1;
// t1=tmp;
// }
// if (t0>tNear) tNear = t0;
// if (t1<tFar ) tFar = t1;
// }
// };
// intersectAxis(cameraPos.x, rayDir.x);
// intersectAxis(cameraPos.y, rayDir.y);
// intersectAxis(cameraPos.z, rayDir.z);
// if (tNear > tFar) continue; // No intersection
// if (tNear < 0.0f) tNear = 0.0f;
// float colorR = 0.f, colorG = 0.f, colorB = 0.f;
// float alphaAccum = 0.f;
// float tCurrent = tNear;
// while (tCurrent < tFar && alphaAccum < 0.99f) {
// Vec3 pos = cameraPos + rayDir * tCurrent;
// // Convert to volume indices
// float fx = pos.x * (d_volumeWidth - 1);
// float fy = pos.y * (d_volumeHeight - 1);
// float fz = pos.z * (d_volumeDepth - 1);
// int ix = (int)roundf(fx);
// int iy = (int)roundf(fy);
// int iz = (int)roundf(fz);
// // Sample
// float density = sampleVolumeNearest(volumeData, d_volumeWidth, d_volumeHeight, d_volumeDepth, ix, iy, iz);
// // Basic transfer function. TODO: Move to a separate file, and then improve
// float alphaSample = density * 0.05f;
// Vec3 baseColor = Vec3(density, 0.1f*density, 1.f - density);
// // If density ~ 0, skip shading
// if (density > 0.001f) {
// Vec3 grad = computeGradient(volumeData, d_volumeWidth, d_volumeHeight, d_volumeDepth, ix, iy, iz);
// Vec3 normal = -grad.normalize();
// Vec3 lightDir = (lightPos - pos).normalize();
// Vec3 viewDir = -rayDir.normalize();
// // Apply Phong
// Vec3 shadedColor = phongShading(normal, lightDir, viewDir, baseColor);
// // Compose
// colorR += (1.0f - alphaAccum) * shadedColor.x * alphaSample;
// colorG += (1.0f - alphaAccum) * shadedColor.y * alphaSample;
// colorB += (1.0f - alphaAccum) * shadedColor.z * alphaSample;
// alphaAccum += (1.0f - alphaAccum) * alphaSample;
// }
// tCurrent += stepSize;
// }
// accumR += colorR;
// accumG += colorG;
// accumB += colorB;
// }
// // Average samples
// accumR /= (float)SAMPLES_PER_PIXEL;
// accumG /= (float)SAMPLES_PER_PIXEL;
// accumB /= (float)SAMPLES_PER_PIXEL;
// // Final colour
// int fbIndex = (py * imageWidth + px) * 3;
// framebuffer[fbIndex + 0] = (unsigned char)(fminf(accumR, 1.f) * 255);
// framebuffer[fbIndex + 1] = (unsigned char)(fminf(accumG, 1.f) * 255);
// framebuffer[fbIndex + 2] = (unsigned char)(fminf(accumB, 1.f) * 255);
// }
// int main(int argc, char** argv) {
// // Generate debug volume data
// float* hostVolume = new float[VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH];
// generateVolume(hostVolume, VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH);
// // Allocate + copy data to GPU
// size_t volumeSize = sizeof(float) * VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH;
// cudaMalloc((void**)&d_volume, volumeSize);
// cudaMemcpy(d_volume, hostVolume, volumeSize, cudaMemcpyHostToDevice);
// int w = VOLUME_WIDTH, h = VOLUME_HEIGHT, d = VOLUME_DEPTH;
// cudaMemcpyToSymbol(d_volumeWidth, &w, sizeof(int));
// cudaMemcpyToSymbol(d_volumeHeight, &h, sizeof(int));
// cudaMemcpyToSymbol(d_volumeDepth, &d, sizeof(int));
// // Allocate framebuffer
// unsigned char* d_framebuffer;
// size_t fbSize = IMAGE_WIDTH * IMAGE_HEIGHT * 3 * sizeof(unsigned char);
// cudaMalloc((void**)&d_framebuffer, fbSize);
// cudaMemset(d_framebuffer, 0, fbSize);
// // Camera and Light
// Vec3 cameraPos(0.5, 0.5, -2.0);
// Vec3 cameraDir(0.0, 0.0, 1.0);
// Vec3 cameraUp(0.0, 1.0, 0.0);
// float fov = 60.0f * (M_PI / 180.0f);
// float stepSize = 0.002f;
// Vec3 lightPos(1.5, 2.0, -1.0);
// // Launch kernel
// dim3 blockSize(16, 16);
// dim3 gridSize((IMAGE_WIDTH + blockSize.x - 1)/blockSize.x,
// (IMAGE_HEIGHT + blockSize.y - 1)/blockSize.y);
// raycastKernel<<<gridSize, blockSize>>>(
// d_volume,
// d_framebuffer,
// IMAGE_WIDTH,
// IMAGE_HEIGHT,
// cameraPos,
// cameraDir.normalize(),
// cameraUp.normalize(),
// fov,
// stepSize,
// lightPos
// );
// cudaDeviceSynchronize();
// // Copy framebuffer back to CPU
// unsigned char* hostFramebuffer = new unsigned char[IMAGE_WIDTH * IMAGE_HEIGHT * 3];
// cudaMemcpy(hostFramebuffer, d_framebuffer, fbSize, cudaMemcpyDeviceToHost);
// // Export image
// saveImage("output.ppm", hostFramebuffer, IMAGE_WIDTH, IMAGE_HEIGHT);
// // Cleanup
// delete[] hostVolume;
// delete[] hostFramebuffer;
// cudaFree(d_volume);
// cudaFree(d_framebuffer);
// std::cout << "Phong-DVR rendering done. Image saved to output.ppm" << std::endl;
// return 0;
// }
// gpu-buffer-handler branch main
#include "hurricanedata/fielddata.h"
#include "hurricanedata/gpubufferhandler.h"
#include "hurricanedata/datareader.h"
#include "hurricanedata/gpubuffer.h"
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <iostream>
#include <cmath>
#include <memory>
#include <iomanip>

__global__ void middleOfTwoValues(float *ans, const FieldMetadata &fmd, FieldData fd) {
    float xi = getVal(fmd, fd, 0, 20, 100, 100);
    float yi = getVal(fmd, fd, 1, 20, 100, 100);
    *ans = (xi+yi)/2;
}

int main() {
    std::string path = "data/atmosphere_MERRA-wind-speed[179253532]";
    std::string variable = "T";

    DataReader dataReader{path, variable};
    std::cout << "created datareader\n";

    GPUBuffer buffer(dataReader);
    std::cout << "created buffer\n";

    GPUBufferHandler bufferHandler(buffer);

    float *ptr_test_read;
    cudaMallocManaged(&ptr_test_read, sizeof(float));

    std::cout << "created buffer handler\n";
    for (int i = 0; i < 10; i++) {
        FieldData fd = bufferHandler.nextFieldData();
        middleOfTwoValues<<<1, 1>>>(ptr_test_read, *bufferHandler.fmd, fd);
        cudaDeviceSynchronize();
        std::cout << "ptr_test_read = " << std::fixed << std::setprecision(6) << *ptr_test_read << "\n";
    }

    // TODO: measure data transfer time in this example code.
    cudaFree(ptr_test_read);

    return 0;
}

Binary file not shown.

View File

@@ -1,102 +0,0 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>
#include "linalg/linalg.h"
#include "objs/sphere.h"
#include "img/handler.h"
#define WIDTH 3840
#define HEIGHT 2160
#define SAMPLES_PER_PIXEL 8
__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
double ambientStrength = 0.1;
double diffuseStrength = 0.8;
double specularStrength = 0.5;
int shininess = 64;
Vec3 ambient = color * ambientStrength;
double diff = max(normal.dot(lightDir), 0.0);
Vec3 diffuse = color * (diffuseStrength * diff);
Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
return ambient + diffuse + specular;
}
__global__ void renderKernel(unsigned char* framebuffer, Sphere* spheres, int numSpheres, Vec3 lightPos) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int pixelIndex = (y * WIDTH + x) * 3;
Vec3 rayOrigin(0, 0, 0);
Vec3 colCum(0, 0, 0);
double spp = static_cast<double>(SAMPLES_PER_PIXEL);
for (int sample = 0; sample < SAMPLES_PER_PIXEL; sample++) {
double u = (x + (sample / spp) - WIDTH / 2.0) / WIDTH;
double v = (y + (sample / spp) - HEIGHT / 2.0) / HEIGHT;
Vec3 rayDir(u, v, 1.0);
rayDir = rayDir.normalize();
for (int i = 0; i < numSpheres; ++i) {
double t;
if (spheres[i].intersect(rayOrigin, rayDir, t)) {
Vec3 hitPoint = rayOrigin + rayDir * t;
Vec3 normal = (hitPoint - spheres[i].center).normalize();
Vec3 lightDir = (lightPos - hitPoint).normalize();
Vec3 viewDir = -rayDir;
colCum = colCum + phongShading(hitPoint, normal, lightDir, viewDir, spheres[i].color);
}
}
}
// Average color across all samples
Vec3 color = colCum * (1.0 / SAMPLES_PER_PIXEL);
framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
}
int main() {
Sphere spheres[] = {
{ Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) }, // Red sphere
{ Vec3(-2, 1, 7), 1.0, Vec3(0.0, 1.0, 0.0) }, // Green sphere
{ Vec3(2, -1, 6), 1.0, Vec3(0.0, 0.0, 1.0) } // Blue sphere
};
int numSpheres = sizeof(spheres) / sizeof(Sphere);
Vec3 lightPos(5, 5, 0);
unsigned char* d_framebuffer;
unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
Sphere* d_spheres;
cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
cudaMalloc(&d_spheres, numSpheres * sizeof(Sphere));
cudaMemcpy(d_spheres, spheres, numSpheres * sizeof(Sphere), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x,
(HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, d_spheres, numSpheres, lightPos);
cudaDeviceSynchronize();
cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
saveImage("output.ppm", h_framebuffer, WIDTH, HEIGHT);
cudaFree(d_framebuffer);
cudaFree(d_spheres);
delete[] h_framebuffer;
std::cout << "High-resolution image saved as output.ppm" << std::endl;
return 0;
}

View File

@@ -1,128 +0,0 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#define WIDTH 800
#define HEIGHT 600
struct Vec3 {
double x, y, z;
__host__ __device__ Vec3() : x(0), y(0), z(0) {}
__host__ __device__ Vec3(double x, double y, double z) : x(x), y(y), z(z) {}
__host__ __device__ Vec3 operator+(const Vec3& b) const { return Vec3(x + b.x, y + b.y, z + b.z); }
__host__ __device__ Vec3 operator-(const Vec3& b) const { return Vec3(x - b.x, y - b.y, z - b.z); }
__host__ __device__ Vec3 operator*(double b) const { return Vec3(x * b, y * b, z * b); }
__host__ __device__ Vec3 operator-() const { return Vec3(-x, -y, -z); }
__host__ __device__ double dot(const Vec3& b) const { return x * b.x + y * b.y + z * b.z; }
__host__ __device__ Vec3 normalize() const { double len = sqrt(x * x + y * y + z * z); return Vec3(x / len, y / len, z / len); }
};
// Simple Phong lighting components
struct Sphere {
Vec3 center;
double radius;
Vec3 color;
__device__ bool intersect(const Vec3& rayOrigin, const Vec3& rayDir, double& t) const {
Vec3 oc = rayOrigin - center;
double b = oc.dot(rayDir);
double c = oc.dot(oc) - radius * radius;
double h = b * b - c;
if (h < 0.0) return false;
h = sqrt(h);
t = -b - h;
return true;
}
};
__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
double ambientStrength = 0.1;
double diffuseStrength = 0.8;
double specularStrength = 0.5;
int shininess = 32;
// Ambient
Vec3 ambient = color * ambientStrength;
// Diffuse
double diff = max(normal.dot(lightDir), 0.0);
Vec3 diffuse = color * (diffuseStrength * diff);
// Specular
Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
return ambient + diffuse + specular;
}
__global__ void renderKernel(unsigned char* framebuffer, Sphere sphere, Vec3 lightPos) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int pixelIndex = (y * WIDTH + x) * 3;
Vec3 rayOrigin(0, 0, 0);
Vec3 rayDir((x - WIDTH / 2.0) / WIDTH, (y - HEIGHT / 2.0) / HEIGHT, 1.0);
rayDir = rayDir.normalize();
double t;
if (sphere.intersect(rayOrigin, rayDir, t)) {
Vec3 hitPoint = rayOrigin + rayDir * t;
Vec3 normal = (hitPoint - sphere.center).normalize();
Vec3 lightDir = (lightPos - hitPoint).normalize();
Vec3 viewDir = -rayDir;
Vec3 color = phongShading(hitPoint, normal, lightDir, viewDir, sphere.color);
framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
} else {
framebuffer[pixelIndex] = 0;
framebuffer[pixelIndex + 1] = 0;
framebuffer[pixelIndex + 2] = 0;
}
}
void saveImage(const char* filename, unsigned char* framebuffer) {
std::ofstream imageFile(filename, std::ios::out | std::ios::binary);
imageFile << "P6\n" << WIDTH << " " << HEIGHT << "\n255\n";
for (int i = 0; i < WIDTH * HEIGHT * 3; i++) {
imageFile << framebuffer[i];
}
imageFile.close();
}
int main() {
// Initialize sphere and light source
Sphere sphere = { Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) }; // Red sphere
Vec3 lightPos(5, 5, 0);
// Allocate framebuffer on device and host
unsigned char* d_framebuffer;
unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
// Launch
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x,
(HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, sphere, lightPos);
cudaDeviceSynchronize();
// Copy result back to host and save
cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
saveImage("output.ppm", h_framebuffer);
// Clean up
cudaFree(d_framebuffer);
delete[] h_framebuffer;
std::cout << "Image saved as output.ppm" << std::endl;
return 0;
}

Binary file not shown.

View File

@@ -1,16 +0,0 @@
#include <iostream>
#include <cuda_runtime.h>
__global__ void hello_from_gpu() {
printf("Hello from GPU!\n");
}
int main() {
hello_from_gpu<<<1, 1>>>();
cudaDeviceSynchronize();
// Reset device
cudaDeviceReset();
return 0;
}

Binary file not shown.

View File

@@ -1,102 +0,0 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>
#include "linalg/linalg.h"
#include "sphere.h"
#include "img/handler.h"
#define WIDTH 3840
#define HEIGHT 2160
#define SAMPLES_PER_PIXEL 8
__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
double ambientStrength = 0.1;
double diffuseStrength = 0.8;
double specularStrength = 0.5;
int shininess = 64;
Vec3 ambient = color * ambientStrength;
double diff = max(normal.dot(lightDir), 0.0);
Vec3 diffuse = color * (diffuseStrength * diff);
Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
return ambient + diffuse + specular;
}
__global__ void renderKernel(unsigned char* framebuffer, Sphere* spheres, int numSpheres, Vec3 lightPos) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int pixelIndex = (y * WIDTH + x) * 3;
Vec3 rayOrigin(0, 0, 0);
Vec3 colCum(0, 0, 0);
double spp = static_cast<double>(SAMPLES_PER_PIXEL);
for (int sample = 0; sample < SAMPLES_PER_PIXEL; sample++) {
double u = (x + (sample / spp) - WIDTH / 2.0) / WIDTH;
double v = (y + (sample / spp) - HEIGHT / 2.0) / HEIGHT;
Vec3 rayDir(u, v, 1.0);
rayDir = rayDir.normalize();
for (int i = 0; i < numSpheres; ++i) {
double t;
if (spheres[i].intersect(rayOrigin, rayDir, t)) {
Vec3 hitPoint = rayOrigin + rayDir * t;
Vec3 normal = (hitPoint - spheres[i].center).normalize();
Vec3 lightDir = (lightPos - hitPoint).normalize();
Vec3 viewDir = -rayDir;
colCum = colCum + phongShading(hitPoint, normal, lightDir, viewDir, spheres[i].color);
}
}
}
// Average color across all samples
Vec3 color = colCum * (1.0 / SAMPLES_PER_PIXEL);
framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
}
int main() {
Sphere spheres[] = {
{ Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) }, // Red sphere
{ Vec3(-2, 1, 7), 1.0, Vec3(0.0, 1.0, 0.0) }, // Green sphere
{ Vec3(2, -1, 6), 1.0, Vec3(0.0, 0.0, 1.0) } // Blue sphere
};
int numSpheres = sizeof(spheres) / sizeof(Sphere);
Vec3 lightPos(5, 5, 0);
unsigned char* d_framebuffer;
unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
Sphere* d_spheres;
cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
cudaMalloc(&d_spheres, numSpheres * sizeof(Sphere));
cudaMemcpy(d_spheres, spheres, numSpheres * sizeof(Sphere), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x,
(HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, d_spheres, numSpheres, lightPos);
cudaDeviceSynchronize();
cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
saveImage("output.ppm", h_framebuffer, WIDTH, HEIGHT);
cudaFree(d_framebuffer);
cudaFree(d_spheres);
delete[] h_framebuffer;
std::cout << "High-resolution image saved as output.ppm" << std::endl;
return 0;
}

51
test_read.py Normal file
View File

@@ -0,0 +1,51 @@
import numpy as np
from netCDF4 import Dataset
# Load the NetCDF file
file_path = 'data/MERRA2_400.inst6_3d_ana_Np.20120101.nc4'
ncfile = Dataset(file_path, 'r')
# Check the available variables in the file
print(ncfile.variables.keys())
U = ncfile.variables['T'][:]
# Check the shape of the variable
print("Shape of U:", U.shape)
# Compute the mean of the variable across all axes (for all elements in U)
U_mean = np.mean(U)
U_sum = np.sum(U)
# Print the mean
print("Mean of U:", U_mean)
print("Sum of U:", U_sum)
sumval = 0
row = U[2,20,100]
print("Shape of row", row.shape)
print("Row data for first 20 entries", row[0:5])
row = row[0:5]
print(f"{type(row)} {row.dtype}")
n = 0
for val in row:
#if not np.ma.is_masked(val):
n+=1
sumval += np.float64(val)
Mean1 = np.mean(row)
Mean2 = sumval/n
print(type(Mean1))
print(type(Mean2))
print(Mean1)
print(Mean2)
print(f"Why does {np.mean(row):.10f} not equal {sumval/n:.10f} ?!")
# Close the NetCDF file
ncfile.close()

18
test_read2.py Normal file
View File

@@ -0,0 +1,18 @@
import numpy as np
from netCDF4 import Dataset
# Load the NetCDF file
file_path = 'data/MERRA2_400.inst6_3d_ana_Np.20120101.nc4'
ncfile = Dataset(file_path, 'r')
# Check the available variables in the file
print(ncfile.variables.keys())
Temp = ncfile.variables['T'][:]
print(f"{Temp[1, 20, 100, 100]=}")
print(f"{Temp.flat[12949732]=}")
# Close the NetCDF file
ncfile.close()

30
test_read3.py Normal file
View File

@@ -0,0 +1,30 @@
import numpy as np
from netCDF4 import Dataset
# file_path = 'data/MERRA2_400.inst6_3d_ana_Np.20120101.nc4'
# ncfile = Dataset(file_path, 'r')
file_paths = [
'data/atmosphere_MERRA-wind-speed[179253532]/MERRA2_400.inst6_3d_ana_Np.20120101.nc4',
'data/atmosphere_MERRA-wind-speed[179253532]/MERRA2_400.inst6_3d_ana_Np.20120102.nc4',
'data/atmosphere_MERRA-wind-speed[179253532]/MERRA2_400.inst6_3d_ana_Np.20120103.nc4'
]
ncfiles = [Dataset(file_path) for file_path in file_paths]
# print(f"{Temp[0, 20, 100, 100]=}")
for i in range(10):
Temp = ncfiles[i//4].variables['T'][:]
x = Temp[i%4, 20, 100, 100]
Temp2 = ncfiles[(i+1)//4].variables['T'][:]
y = Temp2[(i+1)%4, 20, 100, 100]
print(f"{(x+y)/2=}")
# Close the NetCDF file
for ncfile in ncfiles:
ncfile.close()