working buffering
This commit is contained in:
parent 109566d7b6
commit c4a2ce1b44
Makefile (2 changed lines)
@@ -1,6 +1,6 @@
 # Compiler and flags
 NVCC = nvcc
-CXXFLAGS = -I./src -I./hurricanedata -std=c++17 $(shell nc-config --cxx4flags) $(shell nc-config --cxx4libs)
+CXXFLAGS = -I./src -I./hurricanedata -std=c++17 $(shell nc-config --cxx4flags) $(shell nc-config --cxx4libs) -g -G
 COMPILE_OBJ_FLAGS = --device-c
 
 # Directories
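Note: the only Makefile change is appending -g -G to CXXFLAGS. With nvcc, -g emits host-side debug symbols and -G emits device-side debug information while disabling most device-code optimizations, so this is a debug-build setting; it is worth removing again before benchmarking the buffering.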
@@ -43,12 +43,13 @@ void DataReader::loadFile(T* dataOut, size_t fileIndex) {
 }
 
 template <typename T>
-void DataReader::loadFile(T* dataOut, size_t fileIndex, const string& variableName) {
+void DataReader::loadFile(T* dataOut, size_t fileIndex, const string& varName) {
     NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
 
     multimap<string, NcVar> vars = data.getVars();
 
-    NcVar var = vars.find(variableName)->second;
+    NcVar var = vars.find(varName)->second;
+    cout << "var = " << varName << "with size = " << var.getDim(0).getSize() << "\n";
 
     var.getVar(dataOut);
 }
@@ -21,9 +21,6 @@ public:
 
     size_t axisLength(size_t fileIndex, const std::string& axisName);
 
-    // template <typename T>
-    // void readAndAllocateAxis(T** axis_ptr, size_t *size, NcVar var) {
-
     ~DataReader();
 private:
     FilePathManager filePathManager;
@@ -8,10 +8,12 @@ __device__ float getVal(
     const size_t &latInd,
     const size_t &lonInd
 ) {
+    size_t valArrInd = (d.fieldInd + timeInd) / md.numberOfTimeStepsPerFile;
+    size_t trueTimeInd = (d.fieldInd + timeInd) % md.numberOfTimeStepsPerFile;
     size_t sizeSpatialData = md.widthSize*md.heightSize*md.depthSize;
     size_t size2DMapData = md.widthSize*md.heightSize;
-    return d.valArrays[0][
-        timeInd*sizeSpatialData
+    return d.valArrays[valArrInd][
+        trueTimeInd*sizeSpatialData
         + levInd*size2DMapData
         + latInd*md.widthSize
         + lonInd
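Note: these two divmod lines are the core of the buffering change: a field no longer lives in a single file, so a logical time index is split into a buffered-file slot and a step within that file. A worked sketch of the mapping (editor's illustration; the concrete numbers are assumed):

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Mirrors the arithmetic added to getVal, with assumed numbers:
        // numberOfTimeStepsPerFile = 4, the field starts at global step 3,
        // and the caller asks for the field's local step timeInd = 2.
        size_t stepsPerFile = 4;
        size_t fieldInd = 3, timeInd = 2;
        size_t valArrInd   = (fieldInd + timeInd) / stepsPerFile; // = 1 -> d.valArrays[1]
        size_t trueTimeInd = (fieldInd + timeInd) % stepsPerFile; // = 1 -> second step in that file
        std::printf("file slot %zu, local step %zu\n", valArrInd, trueTimeInd);
        return 0;
    }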
@@ -14,6 +14,10 @@ struct FieldMetadata {
     double *lons;
     double *lats;
     double *levs;
+
+    size_t timeSize; // Number of different times
+
+    size_t numberOfTimeStepsPerFile;
 };
 
 using FieldMetadata = FieldMetadata;

@@ -21,16 +25,15 @@ using FieldMetadata = FieldMetadata;
 struct FieldData {
     static constexpr size_t FILESNUM = 2; // Number of files stored in a FieldData struct.
 
-    // An array of length FILESNUM storing pointers to 4D arrays stored in device memory.
-    float *valArrays[FILESNUM];
+    // A uniform array of length FILESNUM storing pointers to 4D arrays stored in device memory.
+    float **valArrays;
 
-    size_t timeSize; // Number of different times
-    // times is a managed Unified Memory array of size timeSize that indicates
-    // that getVal(md, d, t, i, j, k) is a value at time times[t].
-    int *times;
+    size_t fieldInd;
+    // times is a managed Unified Memory array of size (FILESNUM, numberOfTimeStepsPerFile)
+    int **times;
 };
 
-using FieldData = FieldData;
 
 extern __device__ float getVal(
     const FieldMetadata &md,
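Note: valArrays and times turn from inline arrays into double pointers here, so the pointer tables themselves now need an allocation that both the host (which fills them in setupField below) and the device (which runs getVal) can reach. A minimal sketch of that allocation, assuming Unified Memory is the intent and that fielddata.h and the CUDA runtime are in scope; note that the setupField added later passes sizeof(sizeof(float *)*FieldData::FILESNUM), which evaluates to sizeof(size_t) rather than the byte count of a FILESNUM-slot table:

    // Sketch (editor's, not verbatim from this commit): allocate the pointer
    // tables in Unified Memory so the host can fill them and a kernel can
    // dereference them.
    FieldData fd;
    cudaMallocManaged(&fd.valArrays, sizeof(float *) * FieldData::FILESNUM);
    cudaMallocManaged(&fd.times, sizeof(int *) * FieldData::FILESNUM);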
@@ -18,6 +18,8 @@ struct File {
     size_t size;
     float *h_data; // host data
     float *d_data; // device data
+    int *times;
+    size_t timeSize;
 };
 
 struct LoadFileJob {

@@ -53,6 +55,9 @@ public:
     size_t getSize(size_t fileIndex); // Most probably blocking
     void getAxis(GetAxisDoubleJob job);
     void getAxis(GetAxisIntJob job);
 
+    template <typename T>
+    std::pair<size_t, T *> getAxis(size_t fileIndex, const std::string& axisName); // Most probably blocking
     ~impl();
 
     File buffer[numBufferedFiles];
@@ -84,37 +89,44 @@ size_t GPUBuffer::impl::getSize(size_t fileIndex) {
     return future.get();
 }
 
-template <>
-pair<size_t, double *> GPUBuffer::getAxis(size_t fileIndex, const string& axisName) {
-    promise<pair<size_t, double *>> promise;
-    future<pair<size_t, double *>> future = promise.get_future();
-    {
-        lock_guard<mutex> lk(pImpl->queuecv_m);
-        pImpl->jobs.push(GetAxisDoubleJob{fileIndex, axisName, move(promise)});
-    }
-    pImpl->queuecv.notify_all();
-
-    future.wait();
-
-    return future.get();
+template <typename T>
+pair<size_t, T *> GPUBuffer::getAxis(size_t fileIndex, const string& axisName) {
+    return pImpl->getAxis<T>(fileIndex, axisName);
 }
+template pair<size_t, int *> GPUBuffer::getAxis<int>(size_t fileIndex, const string& axisName);
 template pair<size_t, double *> GPUBuffer::getAxis<double>(size_t fileIndex, const string& axisName);
 
 template <>
-pair<size_t, int *> GPUBuffer::getAxis(size_t fileIndex, const string& axisName) {
-    promise<pair<size_t, int *>> promise;
-    future<pair<size_t, int *>> future = promise.get_future();
+pair<size_t, double *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
+    promise<pair<size_t, double *>> promise;
+    future<pair<size_t, double *>> future = promise.get_future();
     {
-        lock_guard<mutex> lk(pImpl->queuecv_m);
-        pImpl->jobs.push(GetAxisIntJob{fileIndex, axisName, move(promise)});
+        lock_guard<mutex> lk(queuecv_m);
+        jobs.push(GetAxisDoubleJob{fileIndex, axisName, move(promise)});
     }
-    pImpl->queuecv.notify_all();
+    queuecv.notify_all();
 
     future.wait();
 
     return future.get();
 }
-template pair<size_t, int *> GPUBuffer::getAxis<int>(size_t fileIndex, const string& axisName);
+template pair<size_t, double *> GPUBuffer::impl::getAxis<double>(size_t fileIndex, const string& axisName);
+
+template <>
+pair<size_t, int *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
+    promise<pair<size_t, int *>> promise;
+    future<pair<size_t, int *>> future = promise.get_future();
+    {
+        lock_guard<mutex> lk(queuecv_m);
+        jobs.push(GetAxisIntJob{fileIndex, axisName, move(promise)});
+    }
+    queuecv.notify_all();
+
+    future.wait();
+
+    return future.get();
+}
+template pair<size_t, int *> GPUBuffer::impl::getAxis<int>(size_t fileIndex, const string& axisName);
 
 GPUBuffer::impl::impl(DataReader& dataReader): dataReader(dataReader) {
     cudaStreamCreate(&iostream);
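Note: the public getAxis<T> is now a thin forwarder into the pImpl, and the per-type impl::getAxis bodies are the requester half of a promise/future handoff: push a job under queuecv_m, wake the worker thread via queuecv, then block on the future until the worker fulfills the promise (impl::getAxis(GetAxisDoubleJob) further down calls job.axis.set_value). A stripped-down, self-contained sketch of that pattern (names illustrative, not from this repository; shutdown handling omitted for brevity):

    #include <condition_variable>
    #include <future>
    #include <mutex>
    #include <queue>
    #include <thread>

    std::queue<std::promise<int>> jobs;
    std::mutex queue_m;
    std::condition_variable queue_cv;

    // Worker half: drain the queue and fulfill each promise.
    void worker() {
        std::unique_lock<std::mutex> lk(queue_m);
        for (;;) {
            queue_cv.wait(lk, [] { return !jobs.empty(); });
            std::promise<int> p = std::move(jobs.front());
            jobs.pop();
            lk.unlock();
            p.set_value(42); // stand-in for the blocking NetCDF read
            lk.lock();
        }
    }

    // Requester half: enqueue, wake the worker, block on the future.
    int request() {
        std::promise<int> p;
        std::future<int> f = p.get_future();
        {
            std::lock_guard<std::mutex> lk(queue_m);
            jobs.push(std::move(p));
        }
        queue_cv.notify_all();
        return f.get();
    }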
@@ -122,34 +134,26 @@ GPUBuffer::impl::impl(DataReader& dataReader): dataReader(dataReader) {
     ioworker = make_unique<thread>([this]() { worker(); });
 
     size_t size = getSize(0);
-    cout << "size = " << size << "\n";
+    auto x = getAxis<int>(0, "time");
+    size_t sizeTime = x.first;
+    cudaFree(x.second);
     for (size_t i = 0; i < numBufferedFiles; i++) {
         {
             File &file = buffer[i];
             lock_guard<mutex> lk(file.m);
             cudaMallocHost(&file.h_data, sizeof(float)*size);
-            cudaError_t status = cudaMalloc(&file.d_data, sizeof(float)*size);
-            if (status != cudaSuccess)
-                cerr << "Error allocating device memory. Status code: " << status << "\n";
+            cudaMalloc(&file.d_data, sizeof(float)*size);
+            cudaMallocManaged(&file.times, sizeof(int)*sizeTime);
             file.size = size;
             file.valid = false;
+            file.timeSize = sizeTime;
         }
-        loadFile(i, i);
-        {
-            // lock_guard<mutex> lk(queuecv_m);
-            // LoadFileJob job = {
-            //     .fileIndex = i,
-            //     .bufferIndex = i
-            // };
-            // cout << "enqueue file load job\n";
-            // jobs.push(job);
-
-        }
+        // loadFile(i, i);
     }
 }
 
-GPUBuffer::~GPUBuffer() { }
+GPUBuffer::~GPUBuffer() {
+}
 
 GPUBuffer::impl::~impl() {
     {
@@ -162,6 +166,7 @@ GPUBuffer::impl::~impl() {
         File &file = buffer[i];
         cudaFree(file.d_data);
         cudaFree(file.h_data);
+        cudaFree(file.times);
     }
     cudaStreamDestroy(iostream);
 }
@@ -172,11 +177,12 @@ void GPUBuffer::impl::loadFile(LoadFileJob job) {
     {
         lock_guard<mutex> lk(file.m);
         assert(!file.valid);
+        dataReader.loadFile<int>(file.times, job.fileIndex, "time"); // TODO: Times dont store anything useful :(
+        cout << "times[1] (inside inside) " << file.times[1] << " for file with fileindex = " << job.fileIndex << "\n";
        dataReader.loadFile<float>(file.h_data, job.fileIndex);
         cudaMemcpyAsync(file.d_data, file.h_data, sizeof(float)*file.size, cudaMemcpyHostToDevice, iostream);
         cudaStreamSynchronize(iostream);
         buffer[job.bufferIndex].valid = true;
-        cout << "loaded file with index" << job.bufferIndex << "\n";
     }
     file.cv.notify_all();
 }
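Note: loadFile keeps the staging pattern the buffer is built around: decode NetCDF into pinned host memory, cudaMemcpyAsync to the device on the dedicated iostream stream, then synchronize only that stream before marking the slot valid. A self-contained sketch of that pattern (illustrative names and sizes, not from this repo):

    #include <cuda_runtime.h>

    // Sketch: staged host-to-device upload through pinned memory on a
    // private stream, so the copy does not serialize the default stream.
    void stagedUpload(const float *src, size_t n) {
        cudaStream_t io;
        cudaStreamCreate(&io);

        float *h_staging, *d_dst;
        cudaMallocHost(&h_staging, n * sizeof(float)); // pinned => async copy is truly async
        cudaMalloc(&d_dst, n * sizeof(float));

        for (size_t i = 0; i < n; i++) h_staging[i] = src[i]; // stand-in for the file read
        cudaMemcpyAsync(d_dst, h_staging, n * sizeof(float), cudaMemcpyHostToDevice, io);
        cudaStreamSynchronize(io); // wait on this stream only

        cudaFree(d_dst);
        cudaFreeHost(h_staging);
        cudaStreamDestroy(io);
    }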
@@ -190,7 +196,7 @@ void GPUBuffer::impl::getAxis(GetAxisDoubleJob job) {
     pair<size_t, double *> array;
     array.first = dataReader.axisLength(job.fileIndex, job.axisName);
     cudaError_t status = cudaMallocManaged(&array.second, array.first*sizeof(double));
-    dataReader.loadFile<double>(array.second, job.fileIndex);
+    dataReader.loadFile<double>(array.second, job.fileIndex, job.axisName);
     job.axis.set_value(array);
 }
@@ -272,6 +278,8 @@ DataHandle GPUBuffer::getDataHandle(size_t bufferIndex) {
 
     DataHandle dh = {
         .d_data = file.d_data,
+        .times = file.times,
+        .timeSize = file.timeSize,
         .size = file.size
     };
     return dh;
@@ -8,7 +8,9 @@
 #include "datareader.h"
 
 struct DataHandle {
-    float *d_data;
+    float *d_data; // Device memory
+    int* times; // Uniform memory
+    size_t timeSize;
     size_t size;
 };
@@ -1,60 +1,93 @@
 #include "gpubufferhandler.h"
 #include "fielddata.h"
-#include "gpubufferhelper.h"
 
-#include <netcdf>
+#include <iostream>
 
 using namespace std;
-using namespace netCDF;
 
-GPUBufferHandler::GPUBufferHandler(const std::string &path, std::string variableName):
-    filePathManager(path), variableName(variableName), presentTimeIndex(0), fileIndex(0) {
-    NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
-
-    multimap<string, NcVar> vars = data.getVars();
-
+GPUBufferHandler::GPUBufferHandler(GPUBuffer& gpuBuffer):
+    gpuBuffer(gpuBuffer), fieldInd(0), bufferInd(0), fileInd(0) {
     cudaMallocManaged(&fmd, sizeof(FieldMetadata));
 
-    readAndAllocateAxis<double>(&fmd->lons, &fmd->widthSize, vars.find("lon")->second);
-    readAndAllocateAxis<double>(&fmd->lats, &fmd->heightSize, vars.find("lat")->second);
-    readAndAllocateAxis<double>(&fmd->levs, &fmd->depthSize, vars.find("lev")->second);
+    auto [widthSize, lons] = gpuBuffer.getAxis<double>(0, "lon");
+    fmd->widthSize = widthSize;
+    fmd->lons = lons;
+
+    auto [heightSize, lats] = gpuBuffer.getAxis<double>(0, "lat");
+    fmd->heightSize = heightSize;
+    fmd->lats = lats;
+
+    auto [depthSize, levs] = gpuBuffer.getAxis<double>(0, "lev");
+    fmd->depthSize = depthSize;
+    fmd->levs = levs;
+
+    for (size_t i = 0; i < GPUBuffer::numBufferedFiles; i++) {
+        gpuBuffer.loadFile(fileInd, fileInd);
+        fileInd++;
+    }
+
+    fmd->numberOfTimeStepsPerFile = 4; // TODO: Maybe find a better way to do this.
+    fmd->timeSize = GPUBufferHandler::numberOfTimeStepsPerField;
+}
+
+FieldData GPUBufferHandler::setupField(size_t newEndBufferInd) {
+    FieldData fd;
+    cudaMallocManaged(&fd.valArrays, sizeof(sizeof(float *)*FieldData::FILESNUM));
+    cudaMallocManaged(&fd.times, sizeof(sizeof(int *)*FieldData::FILESNUM));
+    size_t fieldDataInd = 0;
+    cout << "getting field from files " << bufferInd << " to " << newEndBufferInd << "\n";
+    for (int i = bufferInd; i <= newEndBufferInd; i++) {
+        cout << "getting handle for " << i << "\n";
+        DataHandle x = gpuBuffer.getDataHandle(i);
+        fd.valArrays[fieldDataInd] = x.d_data;
+        fd.times[fieldDataInd] = x.times;
+        fieldDataInd++;
+    }
+    fd.fieldInd = fieldInd;
+
+    return fd;
 }
 
 FieldData GPUBufferHandler::nextFieldData() {
-    NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
-
-    multimap<string, NcVar> vars = data.getVars();
-
-    FieldData fd;
-    size_t timeSize;
-    readAndAllocateAxis(&fd.times, &fd.timeSize, vars.find("time")->second);
-
-    NcVar var = vars.find(variableName)->second;
-
-    int length = 1;
-    for (NcDim dim: var.getDims()) {
-        length *= dim.getSize();
-    }
-
-    // Store NetCDF variable in pinned memory on host
-    float *h_array;
-    cudaMallocHost(&h_array, sizeof(float)*length);
-
-    var.getVar(h_array);
-
-    // Copy data to device
-    cudaError_t status = cudaMalloc(&fd.valArrays[0], sizeof(float)*length);
-    if (status != cudaSuccess)
-        cout << "Error allocating memory: " << status << "\n";
-
-    cudaMemcpyAsync(fd.valArrays[0], h_array, sizeof(float)*length, cudaMemcpyHostToDevice);
-
-    cudaDeviceSynchronize(); // Heavy hammer synchronisation // TODO: Use streams
-
-    cudaFreeHost(h_array);
-
-    return fd;
+    DataHandle x = gpuBuffer.getDataHandle(bufferInd);
+    size_t newFieldInd = (fieldInd + 1) % fmd->numberOfTimeStepsPerFile;
+    size_t newBufferInd = (bufferInd + ((fieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
+
+    size_t endFieldInd = (fieldInd + GPUBufferHandler::numberOfTimeStepsPerField - 1) % fmd->numberOfTimeStepsPerFile;
+    size_t endBufferInd = (bufferInd + (fieldInd + GPUBufferHandler::numberOfTimeStepsPerField - 1)/fmd->numberOfTimeStepsPerFile) % GPUBuffer::numBufferedFiles;
+
+    size_t newEndFieldInd = (endFieldInd + 1) % fmd->numberOfTimeStepsPerFile;
+    size_t newEndBufferInd = (endBufferInd + ((endFieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
+
+    // size_t newBufferInd = (bufferInd + 1) % GPUBuffer::numBufferedFiles;
+    // size_t newFieldInd = (fieldInd + ((bufferInd + 1) / 4)) % x.timeSize;
+
+    // size_t endBufferInd = (bufferInd + GPUBufferHandler::numberOfTimeStepsPerField) % GPUBuffer::numBufferedFiles;
+    // size_t endFieldInd = (fieldInd + ((bufferInd + GPUBufferHandler::numberOfTimeStepsPerField) / 4)) % x.timeSize;
+
+    // size_t newEndBufferInd = (endBufferInd + 1) % GPUBuffer::numBufferedFiles;
+    // size_t newEndFieldInd = (endFieldInd + ((endBufferInd + 1) / 4)) % x.timeSize;
+
+    if(firstTimeStep) {
+        firstTimeStep = false;
+        return setupField(endFieldInd);
+    }
+
+    if (newBufferInd != bufferInd) {
+        fileInd++;
+        gpuBuffer.loadFile(fileInd, bufferInd);
+        bufferInd = newBufferInd;
+        fieldInd = newFieldInd;
+    }
+
+    if (newEndBufferInd != endBufferInd) {
+        // maybe dont do things?
+    }
+
+    return setupField(newEndBufferInd);
 }
 
 GPUBufferHandler::~GPUBufferHandler() {
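Note: the index bookkeeping in nextFieldData slides a window of numberOfTimeStepsPerField steps forward one step per call, across files of numberOfTimeStepsPerFile steps arranged in a ring of numBufferedFiles buffer slots; when the window's start crosses a file boundary, the vacated slot is refilled with the next file via gpuBuffer.loadFile(fileInd, bufferInd). A small sketch that just prints how the indices advance, using the commit's constants and an assumed numBufferedFiles = 2:

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t stepsPerFile = 4;   // fmd->numberOfTimeStepsPerFile
        const size_t stepsPerField = 2;  // GPUBufferHandler::numberOfTimeStepsPerField
        const size_t slots = 2;          // assumed GPUBuffer::numBufferedFiles
        size_t fieldInd = 0, bufferInd = 0;
        for (int call = 0; call < 8; call++) {
            // Same formula as the commit: where does the window's last step land?
            size_t endBufferInd = (bufferInd + (fieldInd + stepsPerField - 1) / stepsPerFile) % slots;
            std::printf("call %d: window starts at slot %zu step %zu, ends in slot %zu\n",
                        call, bufferInd, fieldInd, endBufferInd);
            size_t newFieldInd = (fieldInd + 1) % stepsPerFile;
            size_t newBufferInd = (bufferInd + (fieldInd + 1) / stepsPerFile) % slots;
            if (newBufferInd != bufferInd) { /* here the handler refills the vacated slot */ }
            fieldInd = newFieldInd;
            bufferInd = newBufferInd;
        }
        return 0;
    }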
@@ -8,7 +8,7 @@
 
 class GPUBufferHandler {
 public:
-    GPUBufferHandler();
+    GPUBufferHandler(GPUBuffer& gpuBuffer);
 
     FieldData nextFieldData();
 
@@ -16,8 +16,15 @@ public:
 
     FieldMetadata *fmd;
 
+    static constexpr size_t numberOfTimeStepsPerField = 2; // TODO: Move this to fielddata
+
 private:
-    GPUBuffer
+    FieldData setupField(size_t endBufferInd);
+    GPUBuffer& gpuBuffer;
+    size_t fileInd;
+    size_t bufferInd;
+    size_t fieldInd;
+    bool firstTimeStep = true;
 };
 
 #endif //GPUBUFFERHANDLER_H
@@ -1,14 +0,0 @@
-#include <netcdf>
-#include <cassert>
-
-using namespace std;
-using namespace netCDF;
-
-template <typename T>
-void readAndAllocateAxis(T** axis_ptr, size_t *size, NcVar var) {
-    assert(var.getDimCount() == 1);
-    netCDF::NcDim dim = var.getDim(0);
-    *size = dim.getSize();
-    cudaError_t status = cudaMallocManaged(axis_ptr, *size*sizeof(T));
-    var.getVar(*axis_ptr);
-}
src/main.cu (58 changed lines)
@@ -1,5 +1,5 @@
 // #include "hurricanedata/fielddata.h"
-// #include "hurricanedata/gpubufferhandler.h"
+#include "hurricanedata/gpubufferhandler.h"
 #include "hurricanedata/datareader.h"
 #include "hurricanedata/gpubuffer.h"
 
@@ -10,32 +10,9 @@
 #include <memory>
 #include <iomanip>
 
-// Not parallel computation
-// __global__ void computeMean(float *ans, const FieldMetadata &fmd, FieldData fd) {
-//     float sum = 0;
-//     size_t num_not_masked_values = 0;
-//     for (int i = 0; i < fmd.widthSize; i++) {
-//         double xi = getVal(fmd, fd, 2, 20, 100, i);
-//         if (xi < 1E14) { /* If x is not missing value */
-//             num_not_masked_values++;
-//             sum += xi;
-//         }
-//     }
-//     *ans = sum/num_not_masked_values;
-// }
-
-__global__ void computeMean(float *ans, DataHandle dh) {
-    float sum = 0;
-    size_t num_not_masked_values = 0;
-    for (int i = 0; i < dh.size; i++) {
-        double xi = dh.d_data[i];
-        if (xi < 1E14) { /* If x is not missing value */
-            num_not_masked_values++;
-            sum += xi;
-        }
-    }
-    *ans = sum/num_not_masked_values;
+__global__ void getSingleValue(float *ans, const FieldMetadata &fmd, FieldData fd) {
+    float xi = getVal(fmd, fd, 1, 20, 100, 100);
+    *ans = xi;
 }
 
 int main() {
@@ -52,29 +29,26 @@ int main() {
 
     std::cout << "created buffer\n";
 
-    auto dataHandle = buffer.getDataHandle(0);
+    GPUBufferHandler bufferHandler(buffer);
 
-    std::cout << "got a data handle\n";
+    std::cout << "created buffer handler\n";
 
-    auto x = buffer.getAxis<int>(0, "time");
-    std::cout << "size of x=" << x.first << "\n";
-    std::cout << "x[1]= " <<x.second[1] << "\n";
+    auto fd = bufferHandler.nextFieldData();
 
-    // GPUBufferHandler buffer{path, variable};
+    std::cout << "aquired field\n";
 
-    // auto fd = buffer.nextFieldData();
+    float *ptr_test_read;
+    cudaMallocManaged(&ptr_test_read, sizeof(float));
 
-    float *ptr_mean;
-    cudaMallocManaged(&ptr_mean, sizeof(float));
-
-    computeMean<<<1, 1>>>(ptr_mean, dataHandle);
+    getSingleValue<<<1, 1>>>(ptr_test_read, *bufferHandler.fmd, fd);
 
     cudaDeviceSynchronize();
 
-    std::cout << "Mean = " << std::fixed << std::setprecision(6) << *ptr_mean << "\n";
+    std::cout << "ptr_test_read = " << std::fixed << std::setprecision(6) << *ptr_test_read << "\n";
 
-    // cudaFree(fd.valArrays[0]);
-    cudaFree(ptr_mean);
+    // TODO: Write a example loop using buffering and measure it.
+    cudaFree(fd.valArrays[0]); // TODO: Free data properly in FieldData (maybe make an iterator)
+    cudaFree(ptr_test_read);
     return 0;
 }
@@ -46,5 +46,6 @@ print(Mean2)
 
 print(f"Why does {np.mean(row):.10f} not equal {sumval/n:.10f} ?!")
 
+
 # Close the NetCDF file
 ncfile.close()
@@ -0,0 +1,18 @@
+import numpy as np
+from netCDF4 import Dataset
+
+# Load the NetCDF file
+file_path = 'data/MERRA2_400.inst6_3d_ana_Np.20120101.nc4'
+ncfile = Dataset(file_path, 'r')
+
+# Check the available variables in the file
+print(ncfile.variables.keys())
+
+Temp = ncfile.variables['T'][:]
+
+print(f"{Temp[1, 20, 100, 100]=}")
+print(f"{Temp.flat[12949732]=}")
+
+
+# Close the NetCDF file
+ncfile.close()
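Note: the new script cross-checks one element of the temperature field two ways: by subscript and by flat index. The flat index is the row-major flattening of (time, lev, lat, lon) = (1, 20, 100, 100), which works out exactly if the file has the usual MERRA-2 inst6_3d_ana_Np grid of 42 pressure levels, 361 latitudes, and 576 longitudes (assumed here; the commit does not state the shape). This is also the same indexing scheme getVal uses, so the script doubles as a check of the kernel's getSingleValue readout:

    #include <cstddef>
    #include <cstdio>

    // Sketch: row-major flat index of element (1, 20, 100, 100) on an
    // assumed (time, lev, lat, lon) grid with lev=42, lat=361, lon=576.
    int main() {
        const size_t lev = 42, lat = 361, lon = 576;
        size_t flat = ((1 * lev + 20) * lat + 100) * lon + 100;
        std::printf("%zu\n", flat); // prints 12949732, matching Temp.flat[12949732]
        return 0;
    }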