Merging branches

Martin Opat 2024-12-28 11:36:23 +01:00
commit f0c6141f2c
24 changed files with 1149 additions and 563 deletions

5
.gitignore vendored
View File

@@ -2,10 +2,11 @@
.vscode
# Build
build
build/
build/*
*.o
# Data
data/
# Resulting images
*.ppm

6
load-modules.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Load the needed modules in Habrok
ml CUDA
ml netCDF-C++4

View File

@@ -1,6 +1,7 @@
 # Compiler and flags
 NVCC = nvcc
-CXXFLAGS = -I./src -I./linalg -I./img -I./objs -std=c++17
+CXXFLAGS = -I./src -I./hurricanedata -std=c++17 $(shell nc-config --cxx4flags) $(shell nc-config --cxx4libs) -g -G
+COMPILE_OBJ_FLAGS = --device-c

 # Directories
 SRC_DIR = src
@@ -8,7 +9,7 @@ BUILD_DIR = build
 # Files
 TARGET = $(BUILD_DIR)/main
-SRC_FILES = $(wildcard $(SRC_DIR)/*.cu)
+SRC_FILES := $(shell find $(SRC_DIR) -type f \( -name '*.cu' -o -name '*.cpp' \))
 OBJ_FILES = $(patsubst $(SRC_DIR)/%.cu,$(BUILD_DIR)/%.o,$(SRC_FILES))

@@ -16,11 +17,12 @@ all: $(TARGET)
 # Build the main target
 $(TARGET): $(OBJ_FILES) | $(BUILD_DIR)
-	$(NVCC) $(CXXFLAGS) -o $@ $^
+	$(NVCC) $(CXXFLAGS) $^ -o $@

 # Compile object files
 $(BUILD_DIR)/%.o: $(SRC_DIR)/%.cu
-	$(NVCC) $(CXXFLAGS) -c $< -o $@
+	@mkdir -p $(dir $@)
+	$(NVCC) $(CXXFLAGS) $(COMPILE_OBJ_FLAGS) -c $< -o $@

 # Debug build
 debug: CXXFLAGS += -g

View File

@@ -0,0 +1,77 @@
#include "datareader.h"
#include <netcdf>
#include <cassert>
using namespace std;
using namespace netCDF;
DataReader::DataReader(const std::string &path, std::string variableName):
filePathManager(path), variableName(variableName) { }
size_t DataReader::fileLength(size_t fileIndex) {
NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
multimap<string, NcVar> vars = data.getVars();
NcVar var = vars.find(variableName)->second;
size_t length = 1;
for (NcDim dim: var.getDims()) {
length *= dim.getSize();
}
return length;
}
size_t DataReader::axisLength(size_t fileIndex, const std::string& axisName) {
NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
multimap<string, NcVar> vars = data.getVars();
NcVar var = vars.find(axisName)->second;
assert(var.getDimCount() == 1);
netCDF::NcDim dim = var.getDim(0);
return dim.getSize();
}
template <typename T>
void DataReader::loadFile(T* dataOut, size_t fileIndex) {
loadFile(dataOut, fileIndex, variableName);
}
template <typename T>
void DataReader::loadFile(T* dataOut, size_t fileIndex, const string& varName) {
// TODO: We could make the index wrap around so that we don't lose data
// using fileIndex % NUMBEROFFILES here.
NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
multimap<string, NcVar> vars = data.getVars();
NcVar var = vars.find(varName)->second;
var.getVar(dataOut);
}
template void DataReader::loadFile<float>(float* dataOut, size_t fileIndex, const string& variableName);
template void DataReader::loadFile<int>(int* dataOut, size_t fileIndex, const string& variableName);
template void DataReader::loadFile<double>(double* dataOut, size_t fileIndex, const string& variableName);
template void DataReader::loadFile<double>(double* dataOut, size_t fileIndex);
template void DataReader::loadFile<float>(float* dataOut, size_t fileIndex);
template void DataReader::loadFile<int>(int* dataOut, size_t fileIndex);
DataReader::~DataReader() {
}
// template <typename T>
// void DataReader::readAndAllocateAxis(T** axis_ptr, size_t *size, const string &varName) {
// assert(var.getDimCount() == 1);
// netCDF::NcDim dim = var.getDim(0);
// *size = dim.getSize();
// cudaError_t status = cudaMallocManaged(axis_ptr, *size*sizeof(T));
// var.getVar(*axis_ptr);
// }

View File

@@ -0,0 +1,56 @@
#ifndef DATAREADER_H
#define DATAREADER_H
#include <string>
#include <experimental/propagate_const>
#include <memory>
#include "filepathmanager.h"
/**
* @brief Simple wrapper around all the NetCDF functionality.
*/
class DataReader {
public:
/**
* @brief Constructor.
* @param path Path to the directory containing all the .nc4 files.
* @param variableName The variable we are interested in.
*/
DataReader(const std::string &path, std::string variableName);
/**
* @brief The length of the flat data in variableName.
* Used for allocating the right amount of memory.
*/
size_t fileLength(size_t fileIndex);
/**
* @brief Write all the data in file fileIndex of the variable we're interested in into dataOut.
*/
template <typename T>
void loadFile(T* dataOut, size_t fileIndex);
/**
* @brief Write all the data in file fileIndex of variable variableName into dataOut.
* @param dataOut pointer to memory that should be written to.
* @param fileIndex the index of the file we want to load into memory.
* @param variableName the name of the variable
*/
template <typename T>
void loadFile(T* dataOut, size_t fileIndex, const std::string& variableName);
/**
* @brief Get size of a variable.
*/
size_t axisLength(size_t fileIndex, const std::string& axisName);
~DataReader();
private:
FilePathManager filePathManager;
std::string variableName;
};
#endif //DATAREADER_H
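A minimal usage sketch of this interface (not part of the commit; the directory path and variable name are the ones used in main.cu, and error handling is omitted):

#include "hurricanedata/datareader.h"
#include <vector>

int main() {
    // Read the temperature variable "T" from the first file in the data directory.
    DataReader reader{"data/atmosphere_MERRA-wind-speed[179253532]", "T"};
    size_t n = reader.fileLength(0);                 // total number of values of "T" in file 0
    std::vector<float> values(n);
    reader.loadFile<float>(values.data(), 0);        // fill the buffer with file 0's "T" data
    size_t numTimes = reader.axisLength(0, "time");  // length of the "time" axis
    (void)numTimes;
    return 0;
}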

View File

@@ -0,0 +1,21 @@
#include "fielddata.h"
__device__ float getVal(
const FieldMetadata &md,
const FieldData &d,
const size_t &timeInd,
const size_t &levInd,
const size_t &latInd,
const size_t &lonInd
) {
size_t valArrInd = (d.fieldInd + timeInd) / md.numberOfTimeStepsPerFile;
size_t trueTimeInd = (d.fieldInd + timeInd) % md.numberOfTimeStepsPerFile;
size_t sizeSpatialData = md.widthSize*md.heightSize*md.depthSize;
size_t size2DMapData = md.widthSize*md.heightSize;
return d.valArrays[valArrInd][
trueTimeInd*sizeSpatialData
+ levInd*size2DMapData
+ latInd*md.widthSize
+ lonInd
];
}
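The indexing above amounts to a row-major (time, lev, lat, lon) layout per file. As a host-side illustration of the same arithmetic (flatIndex is a name introduced here, not part of the commit):

#include "hurricanedata/fielddata.h"

// Flat offset of (trueTimeInd, levInd, latInd, lonInd) inside one file's value array.
size_t flatIndex(const FieldMetadata &md, size_t trueTimeInd,
                 size_t levInd, size_t latInd, size_t lonInd) {
    const size_t sizeSpatialData = md.widthSize * md.heightSize * md.depthSize;
    const size_t size2DMapData = md.widthSize * md.heightSize;
    return trueTimeInd * sizeSpatialData
         + levInd * size2DMapData
         + latInd * md.widthSize
         + lonInd;
}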

View File

@@ -0,0 +1,58 @@
#ifndef FIELDDATA_H
#define FIELDDATA_H
#include <vector>
struct FieldMetadata {
size_t widthSize; // Number of different longitudes
size_t heightSize; // Number of different latitudes
size_t depthSize; // Number of different levels
size_t timeSize; // Number of different times
// lons is a managed Unified Memory array of size widthSize such that the value
// returned by getVal with longitude index i has longitude lons[i].
// The other axis arrays (lats, levs, times) are defined analogously.
double *lons;
double *lats;
double *levs;
int *times;
size_t numberOfTimeStepsPerFile;
};
using FieldMetadata = FieldMetadata;
/**
* @brief Allows accessing the data of a time-slice of a scalar field
* that may start in the middle of a file or span multiple files,
* by holding references to multiple files at a time.
*
* @note Use the getVal method to index into it and get values.
*/
struct FieldData {
static constexpr size_t FILESNUM = 2; // Number of files stored in a FieldData struct.
static constexpr size_t numberOfTimeStepsPerField = 2;
size_t fieldInd; // Time-step index within the first buffered file at which this time-slice starts.
// A uniform array of length FILESNUM storing pointers to 4D arrays stored in device memory.
float **valArrays;
};
/**
* @brief Get the scalar field value at a particular index (timeInd, levInd, latInd, lonInd).
* Note that the timeInd may be counter-intuitive. See explanation in gpubufferhandler.h.
*
* @return scalar field float value.
*/
extern __device__ float getVal(
const FieldMetadata &md,
const FieldData &d,
const size_t &timeInd,
const size_t &levInd,
const size_t &latInd,
const size_t &lonInd
);
#endif //FIELDDATA_H

View File

@@ -0,0 +1,40 @@
#include <iostream>
#include <vector>
#include <string>
#include <filesystem>
#include <algorithm>
#include "filepathmanager.h"
namespace fs = std::filesystem;
FilePathManager::FilePathManager(const std::string& path) {
// Check if the directory exists
if (!fs::exists(path) || !fs::is_directory(path)) {
std::cerr << "Error: Directory does not exist or is not valid." << std::endl;
exit(1);
}
// Iterate over directory entries
for (const auto& entry : fs::directory_iterator(path)) {
if (entry.is_regular_file() && entry.path().extension() == ".nc4") {
fileNames.push_back(entry.path().string());
}
}
// Sort file names lexicographically
std::sort(fileNames.begin(), fileNames.end());
// Print sorted file names
std::cout << "Files in directory '" << path << "' sorted lexicographically:\n";
for (const auto& fileName : fileNames) {
std::cout << fileName << std::endl;
}
}
const char* FilePathManager::getPath(size_t index) const {
return fileNames.at(index).c_str();
}
FilePathManager::~FilePathManager() { }

View File

@@ -0,0 +1,28 @@
#ifndef FILEPATHMANAGER_H
#define FILEPATHMANAGER_H
#include <string>
#include <vector>
/**
* @brief Simple class that is responsible for mapping a file index to a file path.
* So for example:
* index = 0 -> MERRA2_400.inst6_3d_ana_Np.20120101.nc4
* index = 1 -> MERRA2_400.inst6_3d_ana_Np.20120102.nc4
* index = 2 -> MERRA2_400.inst6_3d_ana_Np.20120103.nc4
* index = 3 -> MERRA2_400.inst6_3d_ana_Np.20120104.nc4
* etc...
*/
class FilePathManager {
public:
FilePathManager(const std::string& path);
size_t getNumberOfFiles();
const char* getPath(size_t index) const;
~FilePathManager();
private:
std::vector<std::string> fileNames;
};
#endif //FILEPATHMANAGER_H
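Illustrative usage (hypothetical snippet, not part of the commit; the directory path matches the one used in main.cu):

#include "hurricanedata/filepathmanager.h"
#include <iostream>

int main() {
    FilePathManager paths("data/atmosphere_MERRA-wind-speed[179253532]");
    // Index 0 maps to the lexicographically first .nc4 file in the directory.
    std::cout << paths.getPath(0) << "\n";
    return 0;
}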

View File

@@ -0,0 +1,277 @@
#include "gpubuffer.h"
#include <mutex>
#include <thread>
#include <queue>
#include <cassert>
#include <condition_variable>
#include <iostream>
#include <future>
#include <variant>
using namespace std;
struct File {
condition_variable cv;
mutex m;
bool valid;
size_t size;
float *h_data; // host data
float *d_data; // device data
};
struct LoadFileJob {
size_t fileIndex;
size_t bufferIndex;
};
struct GetSizeJob {
size_t fileIndex;
promise<size_t> size;
};
struct GetAxisDoubleJob {
size_t fileIndex;
string axisName;
promise<pair<size_t, double *>> axis;
};
struct GetAxisIntJob {
size_t fileIndex;
string axisName;
promise<pair<size_t, int *>> axis;
};
using Job = variant<LoadFileJob, GetSizeJob, GetAxisIntJob, GetAxisDoubleJob>;
class GPUBuffer::impl {
public:
impl(DataReader& dataReader);
void loadFile(LoadFileJob job);
void loadFile(size_t fileIndex, size_t bufferIndex);
void getSize(GetSizeJob job);
size_t getSize(size_t fileIndex); // Most probably blocking
void getAxis(GetAxisDoubleJob job);
void getAxis(GetAxisIntJob job);
template <typename T>
std::pair<size_t, T *> getAxis(size_t fileIndex, const std::string& axisName); // Most probably blocking
~impl();
File buffer[numBufferedFiles];
// Thread worker things
void worker();
queue<Job> jobs;
unique_ptr<thread> ioworker;
cudaStream_t iostream;
DataReader& dataReader;
condition_variable queuecv;
mutex queuecv_m;
bool workerRunning = true;
};
GPUBuffer::GPUBuffer(DataReader& dataReader): pImpl(make_unique<impl>(dataReader)) { }
size_t GPUBuffer::impl::getSize(size_t fileIndex) {
promise<size_t> promise;
future<size_t> future = promise.get_future();
{
lock_guard<mutex> lk(queuecv_m);
jobs.push(GetSizeJob{fileIndex, move(promise)});
}
queuecv.notify_all();
future.wait();
return future.get();
}
template <typename T>
pair<size_t, T *> GPUBuffer::getAxis(size_t fileIndex, const string& axisName) {
return pImpl->getAxis<T>(fileIndex, axisName);
}
template pair<size_t, int *> GPUBuffer::getAxis<int>(size_t fileIndex, const string& axisName);
template pair<size_t, double *> GPUBuffer::getAxis<double>(size_t fileIndex, const string& axisName);
template <>
pair<size_t, double *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
promise<pair<size_t, double *>> promise;
future<pair<size_t, double *>> future = promise.get_future();
{
lock_guard<mutex> lk(queuecv_m);
jobs.push(GetAxisDoubleJob{fileIndex, axisName, move(promise)});
}
queuecv.notify_all();
future.wait();
return future.get();
}
template pair<size_t, double *> GPUBuffer::impl::getAxis<double>(size_t fileIndex, const string& axisName);
template <>
pair<size_t, int *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
promise<pair<size_t, int *>> promise;
future<pair<size_t, int *>> future = promise.get_future();
{
lock_guard<mutex> lk(queuecv_m);
jobs.push(GetAxisIntJob{fileIndex, axisName, move(promise)});
}
queuecv.notify_all();
future.wait();
return future.get();
}
template pair<size_t, int *> GPUBuffer::impl::getAxis<int>(size_t fileIndex, const string& axisName);
GPUBuffer::impl::impl(DataReader& dataReader): dataReader(dataReader) {
cudaStreamCreate(&iostream);
ioworker = make_unique<thread>([this]() { worker(); });
size_t size = getSize(0);
auto x = getAxis<int>(0, "time");
size_t sizeTime = x.first;
cudaFree(x.second);
for (size_t i = 0; i < numBufferedFiles; i++) {
{
File &file = buffer[i];
lock_guard<mutex> lk(file.m);
cudaMallocHost(&file.h_data, sizeof(float)*size);
cudaMalloc(&file.d_data, sizeof(float)*size);
file.size = size;
file.valid = false;
}
// loadFile(i, i);
}
}
GPUBuffer::~GPUBuffer() {
}
GPUBuffer::impl::~impl() {
{
lock_guard<mutex> lk(queuecv_m);
workerRunning = false;
}
queuecv.notify_all();
ioworker->join();
for (size_t i = 0; i < numBufferedFiles; i++) {
File &file = buffer[i];
cudaFree(file.d_data);
cudaFreeHost(file.h_data);
}
cudaStreamDestroy(iostream);
}
void GPUBuffer::impl::loadFile(LoadFileJob job) {
File &file = buffer[job.bufferIndex];
{
lock_guard<mutex> lk(file.m);
assert(!file.valid);
cout << "loading file with index " << job.fileIndex << "\n";
dataReader.loadFile<float>(file.h_data, job.fileIndex);
cudaMemcpyAsync(file.d_data, file.h_data, sizeof(float)*file.size, cudaMemcpyHostToDevice, iostream);
cudaStreamSynchronize(iostream);
buffer[job.bufferIndex].valid = true;
}
file.cv.notify_all();
}
void GPUBuffer::impl::getSize(GetSizeJob job) {
size_t size = dataReader.fileLength(job.fileIndex);
job.size.set_value(size);
}
void GPUBuffer::impl::getAxis(GetAxisDoubleJob job) {
pair<size_t, double *> array;
array.first = dataReader.axisLength(job.fileIndex, job.axisName);
cudaError_t status = cudaMallocManaged(&array.second, array.first*sizeof(double));
dataReader.loadFile<double>(array.second, job.fileIndex, job.axisName);
job.axis.set_value(array);
}
void GPUBuffer::impl::getAxis(GetAxisIntJob job) {
pair<size_t, int *> array;
array.first = dataReader.axisLength(job.fileIndex, job.axisName);
cudaError_t status = cudaMallocManaged(&array.second, array.first*sizeof(int));
dataReader.loadFile<int>(array.second, job.fileIndex, job.axisName);
job.axis.set_value(array);
}
void GPUBuffer::loadFile(size_t fileIndex, size_t bufferIndex) {
pImpl->loadFile(fileIndex, bufferIndex);
}
void GPUBuffer::impl::loadFile(size_t fileIndex, size_t bufferIndex) {
LoadFileJob job = {
.fileIndex = fileIndex,
.bufferIndex = bufferIndex
};
// Main thread theoretically blocks on 2 mutexes here
// but it _should_ never have to wait for them.
{
File &file = buffer[bufferIndex];
std::unique_lock<std::mutex> lk(file.m, std::defer_lock);
bool lockval = lk.try_lock();
if (!lockval) {
cout << "waited on GPUBuffer during loadFile orchestration :(\n";
lk.lock();
}
file.valid = false;
}
{
std::unique_lock<std::mutex> lk(queuecv_m, std::defer_lock);
bool lockval = lk.try_lock();
if (!lockval) {
cout << "waited on IOworker queue during loadFile orchestration :(\n";
lk.lock();
}
jobs.push(job);
}
queuecv.notify_all();
}
void GPUBuffer::impl::worker() {
while(workerRunning) {
Job job;
{
unique_lock<mutex> lk(queuecv_m);
queuecv.wait(lk, [this]{ return !workerRunning || !jobs.empty(); });
if (!workerRunning) {
return;
}
job = move(jobs.front());
jobs.pop();
}
if(holds_alternative<LoadFileJob>(job)) {
loadFile(get<LoadFileJob>(job));
} else if(holds_alternative<GetSizeJob>(job)) {
getSize(move(get<GetSizeJob>(job)));
} else if(holds_alternative<GetAxisDoubleJob>(job)) {
getAxis(move(get<GetAxisDoubleJob>(job)));
} else if(holds_alternative<GetAxisIntJob>(job)) {
getAxis(move(get<GetAxisIntJob>(job)));
}
}
}
DataHandle GPUBuffer::getDataHandle(size_t bufferIndex) {
File &file = pImpl->buffer[bufferIndex];
// TODO: Might be nice to measure the blocking time here.
unique_lock<mutex> lk(file.m);
file.cv.wait(lk, [this, bufferIndex]{ return pImpl->buffer[bufferIndex].valid; });
DataHandle dh = {
.d_data = file.d_data,
.size = file.size
};
return dh;
}

View File

@@ -0,0 +1,60 @@
#ifndef GPUBUFFER_H
#define GPUBUFFER_H
#include <string>
#include <memory>
#include <experimental/propagate_const>
#include "datareader.h"
struct DataHandle {
float *d_data; // Device memory
size_t size;
// Could include the data stored in host memory h_data in this handle if it were needed.
};
/**
* @brief Handles the asynchronous (un)loading of data to the GPU. The rest of the
* application should not have to interface directly with this class. Getting data
* should go over the GPUBufferHandler class.
*
* @note This class uses a queue to give loading jobs to the worker thread.
* NetCDF-C is not thread safe so you may never read data using netCDF directly
* on any other thread than this worker thread.
*
* Assumes all the data in the nc4 files is the same size.
*
*/
class GPUBuffer {
public:
static constexpr size_t numBufferedFiles = 3; // Number of files stored in memory at one time.
GPUBuffer(DataReader& dataReader);
/**
* @brief Asynchronously tells the GPUBuffer to eventually load a particular file index
* into a buffer index (in which part of the buffer the file should be stored).
*/
void loadFile(size_t fileIndex, size_t bufferIndex); // No blocking
/**
* @brief Get the values stored in a particular buffer index
* @return A struct DataHandle that points to the memory and gives its size.
*/
DataHandle getDataHandle(size_t bufferIndex); // Potentially blocking
/**
* @brief Get the data from an axis (e.g. longitude) and its size.
* @note This is a blocking operation, so it makes a job for the worker
* to read the data and then waits until the job is completed.
*/
template <typename T>
std::pair<size_t, T *> getAxis(size_t fileIndex, const std::string& axisName); // Most probably blocking
~GPUBuffer();
private:
class impl;
std::experimental::propagate_const<std::unique_ptr<impl>> pImpl;
};
#endif //GPUBUFFER_H
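A sketch of how the pieces fit together (hypothetical driver, modeled on GPUBufferHandler's constructor and on main.cu; in normal use GPUBufferHandler issues these calls for you):

#include <cuda_runtime.h>
#include "hurricanedata/datareader.h"
#include "hurricanedata/gpubuffer.h"

int main() {
    DataReader reader{"data/atmosphere_MERRA-wind-speed[179253532]", "T"};
    GPUBuffer buffer(reader);

    buffer.loadFile(/*fileIndex=*/0, /*bufferIndex=*/0);     // queued; the worker thread does the I/O
    DataHandle handle = buffer.getDataHandle(0);             // blocks until the host-to-device copy is done

    auto [numLons, lons] = buffer.getAxis<double>(0, "lon"); // managed memory holding the longitude axis
    cudaFree(lons);                                          // axis arrays are released by the caller
    (void)handle; (void)numLons;
    return 0;
}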

View File

@@ -0,0 +1,93 @@
#include "gpubufferhandler.h"
#include "fielddata.h"
#include <iostream>
using namespace std;
GPUBufferHandler::GPUBufferHandler(GPUBuffer& gpuBuffer):
gpuBuffer(gpuBuffer), fieldInd(0), bufferInd(0), fileInd(0) {
cudaMallocManaged(&fmd, sizeof(FieldMetadata));
auto [widthSize, lons] = gpuBuffer.getAxis<double>(0, "lon");
fmd->widthSize = widthSize;
fmd->lons = lons;
auto [heightSize, lats] = gpuBuffer.getAxis<double>(0, "lat");
fmd->heightSize = heightSize;
fmd->lats = lats;
auto [depthSize, levs] = gpuBuffer.getAxis<double>(0, "lev");
fmd->depthSize = depthSize;
fmd->levs = levs;
for (size_t i = 0; i < GPUBuffer::numBufferedFiles; i++) {
gpuBuffer.loadFile(fileInd,fileInd);
fileInd++;
}
fmd->timeSize = FieldData::numberOfTimeStepsPerField;
// getAxis already returns a cudaMallocManaged array, so fmd->times needs no separate allocation.
auto [numberOfTimeStepsPerFile, times] = gpuBuffer.getAxis<int>(0, "time");
fmd->numberOfTimeStepsPerFile = numberOfTimeStepsPerFile;
fmd->times = times;
cudaMallocManaged(&valArrays, sizeof(float *)*FieldData::FILESNUM);
}
FieldData GPUBufferHandler::setupField(size_t newEndBufferInd) {
FieldData fd;
size_t fieldDataInd = 0;
fd.valArrays = valArrays;
cout << "getting field from files " << bufferInd << " to " << newEndBufferInd << " with a field index of " << fieldInd << "\n";
for (int i = bufferInd; i <= newEndBufferInd; i++) {
DataHandle x = gpuBuffer.getDataHandle(i);
fd.valArrays[fieldDataInd] = x.d_data;
fieldDataInd++;
}
fd.fieldInd = fieldInd;
return fd;
}
FieldData GPUBufferHandler::nextFieldData() {
DataHandle x = gpuBuffer.getDataHandle(bufferInd);
size_t newFieldInd = (fieldInd + 1) % fmd->numberOfTimeStepsPerFile;
size_t newBufferInd = (bufferInd + ((fieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
size_t endFieldInd = (fieldInd + FieldData::numberOfTimeStepsPerField - 1) % fmd->numberOfTimeStepsPerFile;
size_t endBufferInd = (bufferInd + (fieldInd + FieldData::numberOfTimeStepsPerField - 1)/fmd->numberOfTimeStepsPerFile) % GPUBuffer::numBufferedFiles;
size_t newEndFieldInd = (endFieldInd + 1) % fmd->numberOfTimeStepsPerFile;
size_t newEndBufferInd = (endBufferInd + ((endFieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
if(firstTimeStep) {
firstTimeStep = false;
return setupField(endBufferInd);
}
fieldInd = newFieldInd;
if (newBufferInd != bufferInd) {
gpuBuffer.loadFile(fileInd, bufferInd);
fileInd++;
bufferInd = newBufferInd;
}
if (newEndBufferInd != endBufferInd) {
// maybe don't do anything here?
}
return setupField(newEndBufferInd);
}
GPUBufferHandler::~GPUBufferHandler() {
cudaFree(fmd->lons);
cudaFree(fmd->lats);
cudaFree(fmd->levs);
cudaFree(fmd->times);
cudaFree(valArrays);
cudaFree(fmd);
}

View File

@@ -0,0 +1,64 @@
#ifndef GPUBUFFERHANDLER_H
#define GPUBUFFERHANDLER_H
#include "fielddata.h"
#include "gpubuffer.h"
#include <string>
/**
* @brief Responsible for deciding when the GPUBuffer should load and unload files.
* Also assembles and gives access to FieldData.
*
* You will need to interface with this class.
*/
class GPUBufferHandler {
public:
GPUBufferHandler(GPUBuffer& gpuBuffer);
/**
* @brief Produces a FieldData that can be used to retrieve values for a time-slice
* of the scalar field that is FieldData::numberOfTimeStepsPerField time steps wide.
*
* @details This method always increments the start point of the time-slice
* by 1. See below:
*
* time steps = 0 1 2 3 4 5 6 7 8 9 10 11 12 13 ...
* files (4 time steps per file) = [0 1 2 3][4 5 6 7][8 9 10 11][12 13 ...
* nextFieldData() (1st call) = [0 1]
* nextFieldData() (2nd call) = [1 2]
* nextFieldData() (3rd call) = [2 3]
* nextFieldData() (4th call) = [3 4]
* nextFieldData() (5th call) = [4 5]
* etc...
*
* When getting values from a FieldData using the getVal method,
* the time index is relative to the start of the time-slice.
*
* This means that for d = nextFieldData() (4th call),
* getVal(.., fieldData=d, ..., timeInd = 1, ...) gives a value at
* absolute time step 5 as seen above.
*/
FieldData nextFieldData();
~GPUBufferHandler();
/**
* You can get the FieldMetadata from here.
*/
FieldMetadata *fmd;
static void freeFieldData();
private:
FieldData setupField(size_t endBufferInd);
GPUBuffer& gpuBuffer;
size_t fileInd;
size_t bufferInd;
size_t fieldInd;
bool firstTimeStep = true;
float **valArrays;
};
#endif //GPUBUFFERHANDLER_H
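For reference, the intended call pattern (mirroring the loop in main.cu; drive and the index names below are illustrative, not part of the commit):

#include "hurricanedata/gpubuffer.h"
#include "hurricanedata/gpubufferhandler.h"

void drive(GPUBuffer &buffer) {
    GPUBufferHandler handler(buffer);
    for (int i = 0; i < 10; i++) {
        FieldData fd = handler.nextFieldData();
        // Inside a kernel, getVal(*handler.fmd, fd, 0, lev, lat, lon) is absolute time step i,
        // and getVal(*handler.fmd, fd, 1, lev, lat, lon) is absolute time step i + 1.
    }
}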

View File

@@ -1,224 +1,276 @@
// #include <iostream>
// #include <fstream>
// #include <cmath>
// #include <cuda_runtime.h>
// #include "linalg/linalg.h"
// #include "objs/sphere.h"
// #include "img/handler.h"
// // TODO: Eventually, export this into a better place (i.e., such that we do not have to recompile every time we change a parameter)
// static const int VOLUME_WIDTH = 1024;
// static const int VOLUME_HEIGHT = 1024;
// static const int VOLUME_DEPTH = 1024;
// static const int IMAGE_WIDTH = 2560;
// static const int IMAGE_HEIGHT = 1440;
// static const int SAMPLES_PER_PIXEL = 8; // TODO: Right now uses simple variance, consider using something more advanced (e.g., some commonly-used noise map)
// __constant__ int d_volumeWidth;
// __constant__ int d_volumeHeight;
// __constant__ int d_volumeDepth;
// static float* d_volume = nullptr; // TODO: Adjust according to how data is loaded
// // ----------------------------------------------------------------------------------------------------
// __device__ Vec3 phongShading(const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& baseColor) {
// double ambientStrength = 0.3;
// double diffuseStrength = 0.8;
// double specularStrength = 0.5;
// int shininess = 32;
// Vec3 ambient = baseColor * ambientStrength;
// double diff = fmax(normal.dot(lightDir), 0.0);
// Vec3 diffuse = baseColor * (diffuseStrength * diff);
// Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
// double spec = pow(fmax(viewDir.dot(reflectDir), 0.0), shininess);
// Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
// return ambient + diffuse + specular;
// }
// // Raycast + phong
// __global__ void raycastKernel(float* volumeData, unsigned char* framebuffer, int imageWidth, int imageHeight, Vec3 cameraPos, Vec3 cameraDir, Vec3 cameraUp, float fov, float stepSize, Vec3 lightPos) {
// int px = blockIdx.x * blockDim.x + threadIdx.x;
// int py = blockIdx.y * blockDim.y + threadIdx.y;
// if (px >= imageWidth || py >= imageHeight) return;
// float accumR = 0.0f;
// float accumG = 0.0f;
// float accumB = 0.0f;
// // Multiple samples per pixel
// for (int s = 0; s < SAMPLES_PER_PIXEL; s++) {
// // Map to [-1, 1]
// float u = ((px + 0.5f) / imageWidth ) * 2.0f - 1.0f;
// float v = ((py + 0.5f) / imageHeight) * 2.0f - 1.0f;
// // TODO: Move this (and all similar transformation code) to its own separate file
// float tanHalfFov = tanf(fov * 0.5f);
// u *= tanHalfFov;
// v *= tanHalfFov;
// // Find ray direction
// Vec3 cameraRight = (cameraDir.cross(cameraUp)).normalize();
// cameraUp = (cameraRight.cross(cameraDir)).normalize();
// Vec3 rayDir = (cameraDir + cameraRight*u + cameraUp*v).normalize();
// // Intersect (for simplicity just a 3D box from 0 to 1 in all dimensions) - TODO: Think about whether this is the best way to do this
// float tNear = 0.f; // TODO: These are also linear transforms, so move away
// float tFar = 1e6f;
// auto intersectAxis = [&](float start, float dirVal) {
// if (fabsf(dirVal) < 1e-10f) { // TODO: Add a constant - epsilon
// if (start < 0.f || start > 1.f) {
// tNear = 1e9f;
// tFar = -1e9f;
// }
// } else {
// float t0 = (0.0f - start) / dirVal; // TODO: 0.0 and 1.0 depend on the box size -> move to a constant
// float t1 = (1.0f - start) / dirVal;
// if (t0>t1) {
// float tmp=t0;
// t0=t1;
// t1=tmp;
// }
// if (t0>tNear) tNear = t0;
// if (t1<tFar ) tFar = t1;
// }
// };
// intersectAxis(cameraPos.x, rayDir.x);
// intersectAxis(cameraPos.y, rayDir.y);
// intersectAxis(cameraPos.z, rayDir.z);
// if (tNear > tFar) continue; // No intersection
// if (tNear < 0.0f) tNear = 0.0f;
// float colorR = 0.f, colorG = 0.f, colorB = 0.f;
// float alphaAccum = 0.f;
// float tCurrent = tNear;
// while (tCurrent < tFar && alphaAccum < 0.99f) {
// Vec3 pos = cameraPos + rayDir * tCurrent;
// // Convert to volume indices
// float fx = pos.x * (d_volumeWidth - 1);
// float fy = pos.y * (d_volumeHeight - 1);
// float fz = pos.z * (d_volumeDepth - 1);
// int ix = (int)roundf(fx);
// int iy = (int)roundf(fy);
// int iz = (int)roundf(fz);
// // Sample
// float density = sampleVolumeNearest(volumeData, d_volumeWidth, d_volumeHeight, d_volumeDepth, ix, iy, iz);
// // Basic transfer function. TODO: Move to a separate file, and then improve
// float alphaSample = density * 0.05f;
// Vec3 baseColor = Vec3(density, 0.1f*density, 1.f - density);
// // If density ~ 0, skip shading
// if (density > 0.001f) {
// Vec3 grad = computeGradient(volumeData, d_volumeWidth, d_volumeHeight, d_volumeDepth, ix, iy, iz);
// Vec3 normal = -grad.normalize();
// Vec3 lightDir = (lightPos - pos).normalize();
// Vec3 viewDir = -rayDir.normalize();
// // Apply Phong
// Vec3 shadedColor = phongShading(normal, lightDir, viewDir, baseColor);
// // Compose
// colorR += (1.0f - alphaAccum) * shadedColor.x * alphaSample;
// colorG += (1.0f - alphaAccum) * shadedColor.y * alphaSample;
// colorB += (1.0f - alphaAccum) * shadedColor.z * alphaSample;
// alphaAccum += (1.0f - alphaAccum) * alphaSample;
// }
// tCurrent += stepSize;
// }
// accumR += colorR;
// accumG += colorG;
// accumB += colorB;
// }
// // Average samples
// accumR /= (float)SAMPLES_PER_PIXEL;
// accumG /= (float)SAMPLES_PER_PIXEL;
// accumB /= (float)SAMPLES_PER_PIXEL;
// // Final colour
// int fbIndex = (py * imageWidth + px) * 3;
// framebuffer[fbIndex + 0] = (unsigned char)(fminf(accumR, 1.f) * 255);
// framebuffer[fbIndex + 1] = (unsigned char)(fminf(accumG, 1.f) * 255);
// framebuffer[fbIndex + 2] = (unsigned char)(fminf(accumB, 1.f) * 255);
// }
// int main(int argc, char** argv) {
// // Generate debug volume data
// float* hostVolume = new float[VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH];
// generateVolume(hostVolume, VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH);
// // Allocate + copy data to GPU
// size_t volumeSize = sizeof(float) * VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH;
// cudaMalloc((void**)&d_volume, volumeSize);
// cudaMemcpy(d_volume, hostVolume, volumeSize, cudaMemcpyHostToDevice);
// int w = VOLUME_WIDTH, h = VOLUME_HEIGHT, d = VOLUME_DEPTH;
// cudaMemcpyToSymbol(d_volumeWidth, &w, sizeof(int));
// cudaMemcpyToSymbol(d_volumeHeight, &h, sizeof(int));
// cudaMemcpyToSymbol(d_volumeDepth, &d, sizeof(int));
// // Allocate framebuffer
// unsigned char* d_framebuffer;
// size_t fbSize = IMAGE_WIDTH * IMAGE_HEIGHT * 3 * sizeof(unsigned char);
// cudaMalloc((void**)&d_framebuffer, fbSize);
// cudaMemset(d_framebuffer, 0, fbSize);
// // Camera and Light
// Vec3 cameraPos(0.5, 0.5, -2.0);
// Vec3 cameraDir(0.0, 0.0, 1.0);
// Vec3 cameraUp(0.0, 1.0, 0.0);
// float fov = 60.0f * (M_PI / 180.0f);
// float stepSize = 0.002f;
// Vec3 lightPos(1.5, 2.0, -1.0);
// // Launch kernel
// dim3 blockSize(16, 16);
// dim3 gridSize((IMAGE_WIDTH + blockSize.x - 1)/blockSize.x,
// (IMAGE_HEIGHT + blockSize.y - 1)/blockSize.y);
// raycastKernel<<<gridSize, blockSize>>>(
// d_volume,
// d_framebuffer,
// IMAGE_WIDTH,
// IMAGE_HEIGHT,
// cameraPos,
// cameraDir.normalize(),
// cameraUp.normalize(),
// fov,
// stepSize,
// lightPos
// );
// cudaDeviceSynchronize();
// // Copy framebuffer back to CPU
// unsigned char* hostFramebuffer = new unsigned char[IMAGE_WIDTH * IMAGE_HEIGHT * 3];
// cudaMemcpy(hostFramebuffer, d_framebuffer, fbSize, cudaMemcpyDeviceToHost);
// // Export image
// saveImage("output.ppm", hostFramebuffer, IMAGE_WIDTH, IMAGE_HEIGHT);
// // Cleanup
// delete[] hostVolume;
// delete[] hostFramebuffer;
// cudaFree(d_volume);
// cudaFree(d_framebuffer);
// std::cout << "Phong-DVR rendering done. Image saved to output.ppm" << std::endl;
// return 0;
// }
// gpu-buffer-handler branch main
#include "hurricanedata/fielddata.h"
#include "hurricanedata/gpubufferhandler.h"
#include "hurricanedata/datareader.h"
#include "hurricanedata/gpubuffer.h"
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

#include <iostream>
#include <cmath>
#include <memory>
#include <iomanip>

__global__ void middleOfTwoValues(float *ans, const FieldMetadata &fmd, FieldData fd) {
    float xi = getVal(fmd, fd, 0, 20, 100, 100);
    float yi = getVal(fmd, fd, 1, 20, 100, 100);
    *ans = (xi+yi)/2;
}

int main() {
    std::string path = "data/atmosphere_MERRA-wind-speed[179253532]";
    std::string variable = "T";

    DataReader dataReader{path, variable};
    std::cout << "created datareader\n";

    GPUBuffer buffer(dataReader);
    std::cout << "created buffer\n";

    GPUBufferHandler bufferHandler(buffer);

    float *ptr_test_read;
    cudaMallocManaged(&ptr_test_read, sizeof(float));

    std::cout << "created buffer handler\n";
    for (int i = 0; i < 10; i++) {
        FieldData fd = bufferHandler.nextFieldData();
        middleOfTwoValues<<<1, 1>>>(ptr_test_read, *bufferHandler.fmd, fd);
        cudaDeviceSynchronize();
        std::cout << "ptr_test_read = " << std::fixed << std::setprecision(6) << *ptr_test_read << "\n";
    }

    // TODO: measure data transfer time in this example code.
    cudaFree(ptr_test_read);

    return 0;
}

Binary file not shown.

View File

@@ -1,102 +0,0 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>
#include "linalg/linalg.h"
#include "objs/sphere.h"
#include "img/handler.h"
#define WIDTH 3840
#define HEIGHT 2160
#define SAMPLES_PER_PIXEL 8
__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
double ambientStrength = 0.1;
double diffuseStrength = 0.8;
double specularStrength = 0.5;
int shininess = 64;
Vec3 ambient = color * ambientStrength;
double diff = max(normal.dot(lightDir), 0.0);
Vec3 diffuse = color * (diffuseStrength * diff);
Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
return ambient + diffuse + specular;
}
__global__ void renderKernel(unsigned char* framebuffer, Sphere* spheres, int numSpheres, Vec3 lightPos) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int pixelIndex = (y * WIDTH + x) * 3;
Vec3 rayOrigin(0, 0, 0);
Vec3 colCum(0, 0, 0);
double spp = static_cast<double>(SAMPLES_PER_PIXEL);
for (int sample = 0; sample < SAMPLES_PER_PIXEL; sample++) {
double u = (x + (sample / spp) - WIDTH / 2.0) / WIDTH;
double v = (y + (sample / spp) - HEIGHT / 2.0) / HEIGHT;
Vec3 rayDir(u, v, 1.0);
rayDir = rayDir.normalize();
for (int i = 0; i < numSpheres; ++i) {
double t;
if (spheres[i].intersect(rayOrigin, rayDir, t)) {
Vec3 hitPoint = rayOrigin + rayDir * t;
Vec3 normal = (hitPoint - spheres[i].center).normalize();
Vec3 lightDir = (lightPos - hitPoint).normalize();
Vec3 viewDir = -rayDir;
colCum = colCum + phongShading(hitPoint, normal, lightDir, viewDir, spheres[i].color);
}
}
}
// Average color across all samples
Vec3 color = colCum * (1.0 / SAMPLES_PER_PIXEL);
framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
}
int main() {
Sphere spheres[] = {
{ Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) }, // Red sphere
{ Vec3(-2, 1, 7), 1.0, Vec3(0.0, 1.0, 0.0) }, // Green sphere
{ Vec3(2, -1, 6), 1.0, Vec3(0.0, 0.0, 1.0) } // Blue sphere
};
int numSpheres = sizeof(spheres) / sizeof(Sphere);
Vec3 lightPos(5, 5, 0);
unsigned char* d_framebuffer;
unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
Sphere* d_spheres;
cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
cudaMalloc(&d_spheres, numSpheres * sizeof(Sphere));
cudaMemcpy(d_spheres, spheres, numSpheres * sizeof(Sphere), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x,
(HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, d_spheres, numSpheres, lightPos);
cudaDeviceSynchronize();
cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
saveImage("output.ppm", h_framebuffer, WIDTH, HEIGHT);
cudaFree(d_framebuffer);
cudaFree(d_spheres);
delete[] h_framebuffer;
std::cout << "High-resolution image saved as output.ppm" << std::endl;
return 0;
}

View File

@@ -1,128 +0,0 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <fstream>
#define WIDTH 800
#define HEIGHT 600
struct Vec3 {
double x, y, z;
__host__ __device__ Vec3() : x(0), y(0), z(0) {}
__host__ __device__ Vec3(double x, double y, double z) : x(x), y(y), z(z) {}
__host__ __device__ Vec3 operator+(const Vec3& b) const { return Vec3(x + b.x, y + b.y, z + b.z); }
__host__ __device__ Vec3 operator-(const Vec3& b) const { return Vec3(x - b.x, y - b.y, z - b.z); }
__host__ __device__ Vec3 operator*(double b) const { return Vec3(x * b, y * b, z * b); }
__host__ __device__ Vec3 operator-() const { return Vec3(-x, -y, -z); }
__host__ __device__ double dot(const Vec3& b) const { return x * b.x + y * b.y + z * b.z; }
__host__ __device__ Vec3 normalize() const { double len = sqrt(x * x + y * y + z * z); return Vec3(x / len, y / len, z / len); }
};
// Simple Phong lighting components
struct Sphere {
Vec3 center;
double radius;
Vec3 color;
__device__ bool intersect(const Vec3& rayOrigin, const Vec3& rayDir, double& t) const {
Vec3 oc = rayOrigin - center;
double b = oc.dot(rayDir);
double c = oc.dot(oc) - radius * radius;
double h = b * b - c;
if (h < 0.0) return false;
h = sqrt(h);
t = -b - h;
return true;
}
};
__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
double ambientStrength = 0.1;
double diffuseStrength = 0.8;
double specularStrength = 0.5;
int shininess = 32;
// Ambient
Vec3 ambient = color * ambientStrength;
// Diffuse
double diff = max(normal.dot(lightDir), 0.0);
Vec3 diffuse = color * (diffuseStrength * diff);
// Specular
Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
return ambient + diffuse + specular;
}
__global__ void renderKernel(unsigned char* framebuffer, Sphere sphere, Vec3 lightPos) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int pixelIndex = (y * WIDTH + x) * 3;
Vec3 rayOrigin(0, 0, 0);
Vec3 rayDir((x - WIDTH / 2.0) / WIDTH, (y - HEIGHT / 2.0) / HEIGHT, 1.0);
rayDir = rayDir.normalize();
double t;
if (sphere.intersect(rayOrigin, rayDir, t)) {
Vec3 hitPoint = rayOrigin + rayDir * t;
Vec3 normal = (hitPoint - sphere.center).normalize();
Vec3 lightDir = (lightPos - hitPoint).normalize();
Vec3 viewDir = -rayDir;
Vec3 color = phongShading(hitPoint, normal, lightDir, viewDir, sphere.color);
framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
} else {
framebuffer[pixelIndex] = 0;
framebuffer[pixelIndex + 1] = 0;
framebuffer[pixelIndex + 2] = 0;
}
}
void saveImage(const char* filename, unsigned char* framebuffer) {
std::ofstream imageFile(filename, std::ios::out | std::ios::binary);
imageFile << "P6\n" << WIDTH << " " << HEIGHT << "\n255\n";
for (int i = 0; i < WIDTH * HEIGHT * 3; i++) {
imageFile << framebuffer[i];
}
imageFile.close();
}
int main() {
// Initialize sphere and light source
Sphere sphere = { Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) }; // Red sphere
Vec3 lightPos(5, 5, 0);
// Allocate framebuffer on device and host
unsigned char* d_framebuffer;
unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
// Launch
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x,
(HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, sphere, lightPos);
cudaDeviceSynchronize();
// Copy result back to host and save
cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
saveImage("output.ppm", h_framebuffer);
// Clean up
cudaFree(d_framebuffer);
delete[] h_framebuffer;
std::cout << "Image saved as output.ppm" << std::endl;
return 0;
}

Binary file not shown.

View File

@@ -1,16 +0,0 @@
#include <iostream>
#include <cuda_runtime.h>
__global__ void hello_from_gpu() {
printf("Hello from GPU!\n");
}
int main() {
hello_from_gpu<<<1, 1>>>();
cudaDeviceSynchronize();
// Reset device
cudaDeviceReset();
return 0;
}

Binary file not shown.

View File

@@ -1,102 +0,0 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <cmath>
#include "linalg/linalg.h"
#include "sphere.h"
#include "img/handler.h"
#define WIDTH 3840
#define HEIGHT 2160
#define SAMPLES_PER_PIXEL 8
__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
double ambientStrength = 0.1;
double diffuseStrength = 0.8;
double specularStrength = 0.5;
int shininess = 64;
Vec3 ambient = color * ambientStrength;
double diff = max(normal.dot(lightDir), 0.0);
Vec3 diffuse = color * (diffuseStrength * diff);
Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
return ambient + diffuse + specular;
}
__global__ void renderKernel(unsigned char* framebuffer, Sphere* spheres, int numSpheres, Vec3 lightPos) {
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
if (x >= WIDTH || y >= HEIGHT) return;
int pixelIndex = (y * WIDTH + x) * 3;
Vec3 rayOrigin(0, 0, 0);
Vec3 colCum(0, 0, 0);
double spp = static_cast<double>(SAMPLES_PER_PIXEL);
for (int sample = 0; sample < SAMPLES_PER_PIXEL; sample++) {
double u = (x + (sample / spp) - WIDTH / 2.0) / WIDTH;
double v = (y + (sample / spp) - HEIGHT / 2.0) / HEIGHT;
Vec3 rayDir(u, v, 1.0);
rayDir = rayDir.normalize();
for (int i = 0; i < numSpheres; ++i) {
double t;
if (spheres[i].intersect(rayOrigin, rayDir, t)) {
Vec3 hitPoint = rayOrigin + rayDir * t;
Vec3 normal = (hitPoint - spheres[i].center).normalize();
Vec3 lightDir = (lightPos - hitPoint).normalize();
Vec3 viewDir = -rayDir;
colCum = colCum + phongShading(hitPoint, normal, lightDir, viewDir, spheres[i].color);
}
}
}
// Average color across all samples
Vec3 color = colCum * (1.0 / SAMPLES_PER_PIXEL);
framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
}
int main() {
Sphere spheres[] = {
{ Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) }, // Red sphere
{ Vec3(-2, 1, 7), 1.0, Vec3(0.0, 1.0, 0.0) }, // Green sphere
{ Vec3(2, -1, 6), 1.0, Vec3(0.0, 0.0, 1.0) } // Blue sphere
};
int numSpheres = sizeof(spheres) / sizeof(Sphere);
Vec3 lightPos(5, 5, 0);
unsigned char* d_framebuffer;
unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
Sphere* d_spheres;
cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
cudaMalloc(&d_spheres, numSpheres * sizeof(Sphere));
cudaMemcpy(d_spheres, spheres, numSpheres * sizeof(Sphere), cudaMemcpyHostToDevice);
dim3 threadsPerBlock(16, 16);
dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x,
(HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, d_spheres, numSpheres, lightPos);
cudaDeviceSynchronize();
cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
saveImage("output.ppm", h_framebuffer, WIDTH, HEIGHT);
cudaFree(d_framebuffer);
cudaFree(d_spheres);
delete[] h_framebuffer;
std::cout << "High-resolution image saved as output.ppm" << std::endl;
return 0;
}

51
test_read.py Normal file
View File

@@ -0,0 +1,51 @@
import numpy as np
from netCDF4 import Dataset
# Load the NetCDF file
file_path = 'data/MERRA2_400.inst6_3d_ana_Np.20120101.nc4'
ncfile = Dataset(file_path, 'r')
# Check the available variables in the file
print(ncfile.variables.keys())
U = ncfile.variables['T'][:]
# Check the shape of the variable
print("Shape of U:", U.shape)
# Compute the mean of the variable across all axes (for all elements in U)
U_mean = np.mean(U)
U_sum = np.sum(U)
# Print the mean
print("Mean of U:", U_mean)
print("Sum of U:", U_sum)
sumval = 0
row = U[2,20,100]
print("Shape of row", row.shape)
print("Row data for first 20 entries", row[0:5])
row = row[0:5]
print(f"{type(row)} {row.dtype}")
n = 0
for val in row:
#if not np.ma.is_masked(val):
n+=1
sumval += np.float64(val)
Mean1 = np.mean(row)
Mean2 = sumval/n
print(type(Mean1))
print(type(Mean2))
print(Mean1)
print(Mean2)
print(f"Why does {np.mean(row):.10f} not equal {sumval/n:.10f} ?!")
# Close the NetCDF file
ncfile.close()

18
test_read2.py Normal file
View File

@@ -0,0 +1,18 @@
import numpy as np
from netCDF4 import Dataset
# Load the NetCDF file
file_path = 'data/MERRA2_400.inst6_3d_ana_Np.20120101.nc4'
ncfile = Dataset(file_path, 'r')
# Check the available variables in the file
print(ncfile.variables.keys())
Temp = ncfile.variables['T'][:]
print(f"{Temp[1, 20, 100, 100]=}")
print(f"{Temp.flat[12949732]=}")
# Close the NetCDF file
ncfile.close()

30
test_read3.py Normal file
View File

@@ -0,0 +1,30 @@
import numpy as np
from netCDF4 import Dataset
# file_path = 'data/MERRA2_400.inst6_3d_ana_Np.20120101.nc4'
# ncfile = Dataset(file_path, 'r')
file_paths = [
'data/atmosphere_MERRA-wind-speed[179253532]/MERRA2_400.inst6_3d_ana_Np.20120101.nc4',
'data/atmosphere_MERRA-wind-speed[179253532]/MERRA2_400.inst6_3d_ana_Np.20120102.nc4',
'data/atmosphere_MERRA-wind-speed[179253532]/MERRA2_400.inst6_3d_ana_Np.20120103.nc4'
]
ncfiles = [Dataset(file_path) for file_path in file_paths]
# print(f"{Temp[0, 20, 100, 100]=}")
for i in range(10):
Temp = ncfiles[i//4].variables['T'][:]
x = Temp[i%4, 20, 100, 100]
Temp2 = ncfiles[(i+1)//4].variables['T'][:]
y = Temp2[(i+1)%4, 20, 100, 100]
print(f"{(x+y)/2=}")
# Close the NetCDF file
for ncfile in ncfiles:
ncfile.close()