diff --git a/.gitignore b/.gitignore
index 44ce3ac..1c3a1df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,11 @@
 .vscode
 
 # Build
-build
 build/
-build/*
+*.o
 
+# Data
+data/
+
+# Resulting images
+*.ppm
diff --git a/README.md b/README.md
index 15c50f1..45b5a9e 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,22 @@
-# cuda-based-raytrace
\ No newline at end of file
+# cuda-based-raytrace
+
+## How to run
+If necessary clean previous build:
+```bash
+make clean
+```
+
+Then build (currently always debug build):
+```bash
+make all
+```
+
+Run:
+```bash
+./build/main
+```
+
+To open the resulting image you can use (Ubuntu):
+```bash
+xdg-open output.ppm
+```
\ No newline at end of file
diff --git a/load-modules.sh b/load-modules.sh
new file mode 100755
index 0000000..c6fcab4
--- /dev/null
+++ b/load-modules.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# Load the needed modules in Habrok
+
+ml CUDA
+ml netCDF-C++4
diff --git a/makefile b/makefile
index 18993ff..90d9c3c 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,7 @@
 # Compiler and flags
 NVCC = nvcc
-CXXFLAGS = -I./src -I./linalg -I./img -I./objs -std=c++17
+CXXFLAGS = -I./src -std=c++17 $(shell nc-config --cxx4flags) $(shell nc-config --cxx4libs) -g -G
+COMPILE_OBJ_FLAGS = --device-c
 
 # Directories
 SRC_DIR = src
@@ -8,7 +9,7 @@ BUILD_DIR = build
 
 # Files
 TARGET = $(BUILD_DIR)/main
-SRC_FILES = $(wildcard $(SRC_DIR)/*.cu)
+SRC_FILES := $(shell find $(SRC_DIR) -type f \( -name '*.cu' -o -name '*.cpp' \))
 OBJ_FILES = $(patsubst $(SRC_DIR)/%.cu,$(BUILD_DIR)/%.o,$(SRC_FILES))
 
 # Default target
@@ -16,11 +17,12 @@ all: $(TARGET)
 
 # Build the main target
 $(TARGET): $(OBJ_FILES) | $(BUILD_DIR)
-	$(NVCC) $(CXXFLAGS) -o $@ $^
+	$(NVCC) $(CXXFLAGS) $^ -o $@
 
 # Compile object files
 $(BUILD_DIR)/%.o: $(SRC_DIR)/%.cu
-	$(NVCC) $(CXXFLAGS) -c $< -o $@
+	@mkdir -p $(dir $@)
+	$(NVCC) $(CXXFLAGS) $(COMPILE_OBJ_FLAGS) -c $< -o $@
 
 # Debug build
 debug: CXXFLAGS += -g
@@ -34,4 +36,4 @@ clean:
 $(BUILD_DIR):
 	mkdir -p $(BUILD_DIR)
 
-.PHONY: all clean debug
+.PHONY: all clean debug
\ No newline at end of file
diff --git a/output.ppm b/output.ppm
deleted file mode 100644
index 1d7b780..0000000
Binary files a/output.ppm and /dev/null differ
diff --git a/src/consts.cu b/src/consts.cu
new file mode 100644
index 0000000..18cf40a
--- /dev/null
+++ b/src/consts.cu
@@ -0,0 +1,18 @@
+#include "consts.h"
+
+__device__ Point3 d_cameraPos;
+__device__ Vec3 d_cameraDir;
+__device__ Vec3 d_cameraUp;
+__device__ Point3 d_lightPos;
+
+Point3 h_cameraPos = Point3::init(-0.7, -1.0, -2.0);
+Vec3 h_cameraDir = Vec3::init(0.4, 0.6, 1.0).normalize();
+Vec3 h_cameraUp = Vec3::init(0.0, 1.0, 0.0).normalize();
+Point3 h_lightPos = Point3::init(1.5, 2.0, -1.0);
+
+void copyConstantsToDevice() {
+    cudaMemcpyToSymbol(d_cameraPos, &h_cameraPos, sizeof(Point3));
+    cudaMemcpyToSymbol(d_cameraDir, &h_cameraDir, sizeof(Vec3));
+    cudaMemcpyToSymbol(d_cameraUp, &h_cameraUp, sizeof(Vec3));
+    cudaMemcpyToSymbol(d_lightPos, &h_lightPos, sizeof(Point3));
+}
\ No newline at end of file
diff --git a/src/consts.h b/src/consts.h
new file mode 100644
index 0000000..774c297
--- /dev/null
+++ b/src/consts.h
@@ -0,0 +1,44 @@
+#ifndef CONSTS_H
+#define CONSTS_H
+
+#include "linalg/vec.h"
+#include <cmath>
+
+// --------------------------- Basic Constants ---------------------------
+const int VOLUME_WIDTH  = 49;
+const int VOLUME_HEIGHT = 51;
+const int VOLUME_DEPTH  = 42;
+
+const int IMAGE_WIDTH   = 2560;
+const int IMAGE_HEIGHT  = 1440;
+
+const double epsilon = 1e-10f;
+const double infty   = 1e15f;  // This vlalue is used to represent missing values in data
+
+
+// --------------------------- Raycasting Constants ---------------------------
+const int SAMPLES_PER_PIXEL = 8;  // TODO: Right now uses simple variance, consider using something more advanced (e.g., some commonly-used noise map)
+
+const float alphaAcumLimit = 0.65f;   // TODO: Idk what a good accumulation value is
+const float minAllowedDensity = 0.001f;
+
+const float stepSize = 0.002f;
+
+
+// --------------------------- Illumination Constants ---------------------------
+const double ambientStrength  = 0.3;
+const double diffuseStrength  = 0.8;
+const double specularStrength = 0.5;
+const int shininess           = 32;
+const float fov             = 60.0f * (M_PI / 180.0f);
+
+// Camera and Light
+extern __device__ Point3 d_cameraPos;
+extern __device__ Vec3 d_cameraDir;
+extern __device__ Vec3 d_cameraUp;
+extern __device__ Point3 d_lightPos;
+
+// --------------------------- Functions for handling external constants ---------------------------
+void copyConstantsToDevice();
+
+#endif // CONSTS_H
\ No newline at end of file
diff --git a/src/hurricanedata/datareader.cu b/src/hurricanedata/datareader.cu
new file mode 100644
index 0000000..f62f316
--- /dev/null
+++ b/src/hurricanedata/datareader.cu
@@ -0,0 +1,77 @@
+#include "datareader.h"
+
+#include <netcdf>
+#include <cassert>
+
+using namespace std;
+using namespace netCDF;
+
+DataReader::DataReader(const std::string &path, std::string variableName):
+filePathManager(path), variableName(variableName) { }
+
+size_t DataReader::fileLength(size_t fileIndex) {
+    NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
+
+    multimap<string, NcVar> vars = data.getVars();
+
+    NcVar var = vars.find(variableName)->second;   
+    
+    size_t length = 1;
+    for (NcDim dim: var.getDims()) {
+        length *= dim.getSize();
+    }
+
+    return length;
+}
+
+size_t DataReader::axisLength(size_t fileIndex, const std::string& axisName) {
+    NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
+
+    multimap<string, NcVar> vars = data.getVars();
+
+    NcVar var = vars.find(axisName)->second;   
+
+    assert(var.getDimCount() == 1);
+
+    netCDF::NcDim dim = var.getDim(0);
+    return dim.getSize();
+}
+
+template <typename T>
+void DataReader::loadFile(T* dataOut, size_t fileIndex) {
+    loadFile(dataOut, fileIndex, variableName);
+}
+
+template <typename T>
+void DataReader::loadFile(T* dataOut, size_t fileIndex, const string& varName) {
+    // TODO: We could make the index wrap around so that we dont lose data
+    // using fileIndex % NUMBEROFFILES here.
+
+    NcFile data(filePathManager.getPath(fileIndex), NcFile::read);
+
+    multimap<string, NcVar> vars = data.getVars();
+
+    NcVar var = vars.find(varName)->second;   
+
+    var.getVar(dataOut);
+}
+
+template void DataReader::loadFile<float>(float* dataOut, size_t fileIndex, const string& variableName);
+template void DataReader::loadFile<int>(int* dataOut, size_t fileIndex, const string& variableName);
+template void DataReader::loadFile<double>(double* dataOut, size_t fileIndex, const string& variableName);
+template void DataReader::loadFile<double>(double* dataOut, size_t fileIndex);
+template void DataReader::loadFile<float>(float* dataOut, size_t fileIndex);
+template void DataReader::loadFile<int>(int* dataOut, size_t fileIndex);
+
+DataReader::~DataReader() {
+
+}
+
+// template <typename T>
+// void DataReader::readAndAllocateAxis(T** axis_ptr, size_t *size, const string &varName) {
+//     assert(var.getDimCount() == 1);
+//     netCDF::NcDim dim = var.getDim(0);
+//     *size = dim.getSize();
+//     cudaError_t status = cudaMallocManaged(axis_ptr, *size*sizeof(T));
+//     var.getVar(*axis_ptr);
+// }
\ No newline at end of file
diff --git a/src/hurricanedata/datareader.h b/src/hurricanedata/datareader.h
new file mode 100644
index 0000000..9c82aea
--- /dev/null
+++ b/src/hurricanedata/datareader.h
@@ -0,0 +1,56 @@
+#ifndef DATAREADER_H
+#define DATAREADER_H
+
+#include <string>
+#include <experimental/propagate_const>
+#include <memory>
+
+#include "filepathmanager.h"
+
+
+/**
+ * @brief Simple wrapper around all the NetCDF functionality.
+ */
+class DataReader {
+public:
+    /**
+     * @brief Constructor.
+     * @param path Path to the directory containing all the .nc4 files.
+     * @param variableName The variable we are interested in.
+     */
+    DataReader(const std::string &path, std::string variableName);
+
+    /**
+     * @brief The length of the flat data in variableName.
+     * Used for allocating the right amount of memory.
+     */
+    size_t fileLength(size_t fileIndex);
+
+    /**
+     * @brief Write all the data in file fileIndex of variable we re interested in into the dataOut.
+     */
+    template <typename T>
+    void loadFile(T* dataOut, size_t fileIndex);
+
+    /**
+     * @brief Write all the data in file fileIndex of variable variableName into the dataOut.
+     * @param dataOut pointer to memory that should be written to.
+     * @param fileIndex the index of the file we want to load into memory.
+     * @param variableName the name of the variable
+     */
+    template <typename T>
+    void loadFile(T* dataOut, size_t fileIndex, const std::string& variableName);
+
+    /**
+     * @brief Get size of a variable.
+     */
+    size_t axisLength(size_t fileIndex, const std::string& axisName);
+
+    ~DataReader();
+private:
+    FilePathManager filePathManager;
+    std::string variableName;
+};
+
+
+#endif //DATAREADER_H
diff --git a/src/hurricanedata/fielddata.cu b/src/hurricanedata/fielddata.cu
new file mode 100644
index 0000000..38b0d25
--- /dev/null
+++ b/src/hurricanedata/fielddata.cu
@@ -0,0 +1,21 @@
+#include "fielddata.h"
+
+__device__ float getVal(
+    const FieldMetadata &md,
+    const FieldData &d,
+    const size_t &timeInd,
+    const size_t &levInd,
+    const size_t &latInd,
+    const size_t &lonInd
+) {
+    size_t valArrInd = (d.fieldInd + timeInd) / md.numberOfTimeStepsPerFile;
+    size_t trueTimeInd = (d.fieldInd + timeInd) % md.numberOfTimeStepsPerFile;
+    size_t sizeSpatialData = md.widthSize*md.heightSize*md.depthSize;
+    size_t size2DMapData = md.widthSize*md.heightSize;
+    return d.valArrays[valArrInd][
+        trueTimeInd*sizeSpatialData
+        + levInd*size2DMapData
+        + latInd*md.widthSize
+        + lonInd
+    ]; 
+}
\ No newline at end of file
diff --git a/src/hurricanedata/fielddata.h b/src/hurricanedata/fielddata.h
new file mode 100644
index 0000000..4a3257c
--- /dev/null
+++ b/src/hurricanedata/fielddata.h
@@ -0,0 +1,58 @@
+#ifndef FIELDDATA_H
+#define FIELDDATA_H
+
+#include <vector>
+
+struct FieldMetadata {
+    size_t widthSize; // Number of different longitudes
+    size_t heightSize; // Number of different latitudes
+    size_t depthSize; // Number of different levels
+    size_t timeSize; // Number of different times
+
+    // lons is a managed Unified Memory array of size widthCount that indicates 
+    // that getVal(t, i, j, k) is a value with longitude of lons[i].
+    // The other such arrays are similarly defined.
+    double *lons; 
+    double *lats;
+    double *levs;
+    int *times;
+
+    size_t numberOfTimeStepsPerFile;
+};
+
+using FieldMetadata = FieldMetadata;
+
+/**
+ * @brief Allows for accessing data of a time-slice of a scalar field
+ * that may start in the middle of a file or go range over multiple files
+ * by holding references to multiple files at a time.
+ * 
+ * @note Use the getVal method to index into it and get values.
+ */
+struct FieldData {
+    static constexpr size_t FILESNUM = 2; // Number of files stored in a FieldData struct.
+    static constexpr size_t numberOfTimeStepsPerField = 2;
+
+    size_t fieldInd; // Indicates 
+
+    // A uniform array of length FILESNUM storing pointers to 4D arrays stored in device memory.
+    float **valArrays;
+
+};
+
+/**
+ * @brief Get the scalar field value at a particular index (timeInd, levInd, latInd, lonInd).
+ * Note that the timeInd may be counter-intuitive. See explanation in gpubufferhandler.h.
+ * 
+ * @return scalar field float value.
+ */
+extern __device__ float getVal(
+    const FieldMetadata &md,
+    const FieldData &d,
+    const size_t &timeInd,
+    const size_t &levInd,
+    const size_t &latInd,
+    const size_t &lonInd
+);
+
+#endif //FIELDDATA_H
diff --git a/src/hurricanedata/filepathmanager.cpp b/src/hurricanedata/filepathmanager.cpp
new file mode 100644
index 0000000..03c7f39
--- /dev/null
+++ b/src/hurricanedata/filepathmanager.cpp
@@ -0,0 +1,40 @@
+#include <iostream>
+#include <vector>
+#include <string>
+#include <filesystem>
+#include <algorithm>
+
+#include "filepathmanager.h"
+
+namespace fs = std::filesystem;
+
+FilePathManager::FilePathManager(const std::string& path) {
+    // Check if the directory exists
+    if (!fs::exists(path) || !fs::is_directory(path)) {
+        std::cerr << "Error: Directory does not exist or is not valid." << std::endl;
+        exit(1);
+    }
+
+    // Iterate over directory entries
+    for (const auto& entry : fs::directory_iterator(path)) {
+        if (entry.is_regular_file() && entry.path().extension() == ".nc4") {
+            fileNames.push_back(entry.path().string());
+        }
+    }
+
+    // Sort file names lexicographically
+    std::sort(fileNames.begin(), fileNames.end());
+
+    // Print sorted file names
+    std::cout << "Files in directory '" << path << "' sorted lexicographically:\n";
+    for (const auto& fileName : fileNames) {
+        std::cout << fileName << std::endl;
+    }
+}
+
+const char* FilePathManager::getPath(size_t index) const {
+    return fileNames.at(index).c_str();
+}
+
+
+FilePathManager::~FilePathManager() { }
\ No newline at end of file
diff --git a/src/hurricanedata/filepathmanager.h b/src/hurricanedata/filepathmanager.h
new file mode 100644
index 0000000..96f9ec5
--- /dev/null
+++ b/src/hurricanedata/filepathmanager.h
@@ -0,0 +1,28 @@
+#ifndef FILEPATHMANAGER_H
+#define FILEPATHMANAGER_H
+
+#include <string>
+#include <vector>
+
+/**
+ * @brief Simple class that is responsible for mapping a file index to a file path.
+ * So for example:
+ * index = 0 -> MERRA2_400.inst6_3d_ana_Np.20120101.nc4
+ * index = 1 -> MERRA2_400.inst6_3d_ana_Np.20120102.nc4
+ * index = 2 -> MERRA2_400.inst6_3d_ana_Np.20120103.nc4
+ * index = 3 -> MERRA2_400.inst6_3d_ana_Np.20120104.nc4
+ * etc...
+ */
+class FilePathManager {
+public:
+    FilePathManager(const std::string& path);
+    size_t getNumberOfFiles();
+    const char* getPath(size_t index) const;
+
+    ~FilePathManager();
+
+private:
+    std::vector<std::string> fileNames;
+};
+
+#endif //FILEPATHMANAGER_H
\ No newline at end of file
diff --git a/src/hurricanedata/gpubuffer.cu b/src/hurricanedata/gpubuffer.cu
new file mode 100644
index 0000000..7edac38
--- /dev/null
+++ b/src/hurricanedata/gpubuffer.cu
@@ -0,0 +1,277 @@
+#include "gpubuffer.h"
+
+#include <mutex>
+#include <thread>
+#include <queue>
+#include <cassert>
+#include <condition_variable>
+#include <iostream>
+#include <future>
+#include <variant>
+
+using namespace std;
+
+struct File {
+    condition_variable cv;
+    mutex m;
+    bool valid;
+    size_t size;
+    float *h_data; // host data
+    float *d_data; // device data
+};
+
+struct LoadFileJob {
+    size_t fileIndex;
+    size_t bufferIndex;
+};
+
+struct GetSizeJob {
+    size_t fileIndex;
+    promise<size_t> size;
+};
+
+struct GetAxisDoubleJob {
+    size_t fileIndex;
+    string axisName;
+    promise<pair<size_t, double *>> axis;
+};
+
+struct GetAxisIntJob {
+    size_t fileIndex;
+    string axisName;
+    promise<pair<size_t, int *>> axis;
+};
+
+using Job = variant<LoadFileJob, GetSizeJob, GetAxisIntJob, GetAxisDoubleJob>;
+
+class GPUBuffer::impl {
+public:
+    impl(DataReader& dataReader);
+    void loadFile(LoadFileJob job);
+    void loadFile(size_t fileIndex, size_t bufferIndex);
+    void getSize(GetSizeJob job);
+    size_t getSize(size_t fileIndex); // Most probably blocking
+    void getAxis(GetAxisDoubleJob job);
+    void getAxis(GetAxisIntJob job);
+
+    template <typename T>
+    std::pair<size_t, T *> getAxis(size_t fileIndex, const std::string& axisName); // Most probably blocking
+    ~impl();
+
+    File buffer[numBufferedFiles];
+
+    // Thread worker things
+    void worker();
+    queue<Job> jobs;
+    unique_ptr<thread> ioworker;
+    cudaStream_t iostream;
+    DataReader& dataReader;
+    condition_variable queuecv;
+    mutex queuecv_m;
+    bool workerRunning = true;
+};
+
+GPUBuffer::GPUBuffer(DataReader& dataReader): pImpl(make_unique<impl>(dataReader)) { }
+
+size_t GPUBuffer::impl::getSize(size_t fileIndex) {
+    promise<size_t> promise;
+    future<size_t> future = promise.get_future();
+    {
+        lock_guard<mutex> lk(queuecv_m);
+        jobs.push(GetSizeJob{fileIndex, move(promise)});
+    }
+    queuecv.notify_all();
+
+    future.wait();
+
+    return future.get();
+}
+
+template <typename T>
+pair<size_t, T *> GPUBuffer::getAxis(size_t fileIndex, const string& axisName) {
+    return pImpl->getAxis<T>(fileIndex, axisName);
+}
+template pair<size_t, int *> GPUBuffer::getAxis<int>(size_t fileIndex, const string& axisName);
+template pair<size_t, double *> GPUBuffer::getAxis<double>(size_t fileIndex, const string& axisName);
+
+template <>
+pair<size_t, double *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
+    promise<pair<size_t, double *>> promise;
+    future<pair<size_t, double *>> future = promise.get_future();
+    {
+        lock_guard<mutex> lk(queuecv_m);
+        jobs.push(GetAxisDoubleJob{fileIndex, axisName, move(promise)});
+    }
+    queuecv.notify_all();
+
+    future.wait();
+
+    return future.get();
+}
+template pair<size_t, double *> GPUBuffer::impl::getAxis<double>(size_t fileIndex, const string& axisName);
+
+template <>
+pair<size_t, int *> GPUBuffer::impl::getAxis(size_t fileIndex, const string& axisName) {
+    promise<pair<size_t, int *>> promise;
+    future<pair<size_t, int *>> future = promise.get_future();
+    {
+        lock_guard<mutex> lk(queuecv_m);
+        jobs.push(GetAxisIntJob{fileIndex, axisName, move(promise)});
+    }
+    queuecv.notify_all();
+
+    future.wait();
+
+    return future.get();
+}
+template pair<size_t, int *> GPUBuffer::impl::getAxis<int>(size_t fileIndex, const string& axisName);
+
+GPUBuffer::impl::impl(DataReader& dataReader): dataReader(dataReader) {
+    cudaStreamCreate(&iostream);
+
+    ioworker = make_unique<thread>([this]() { worker(); });
+
+    size_t size = getSize(0);
+    auto x = getAxis<int>(0, "time");
+    size_t sizeTime = x.first;
+    cudaFree(x.second);
+    for (size_t i = 0; i < numBufferedFiles; i++) {
+        {
+            File &file = buffer[i];
+            lock_guard<mutex> lk(file.m);
+            cudaMallocHost(&file.h_data, sizeof(float)*size);
+            cudaMalloc(&file.d_data, sizeof(float)*size);
+            file.size = size;
+            file.valid = false;
+        }
+        // loadFile(i, i);
+    }
+}
+
+GPUBuffer::~GPUBuffer() {
+}
+
+GPUBuffer::impl::~impl() {
+    {
+        lock_guard<mutex> lk(queuecv_m);
+        workerRunning = false;
+    }
+    queuecv.notify_all();
+    ioworker->join();
+    for (size_t i = 0; i < numBufferedFiles; i++) {
+        File &file = buffer[i];
+        cudaFree(file.d_data);
+        cudaFreeHost(file.h_data);
+    }
+    cudaStreamDestroy(iostream);
+}
+
+void GPUBuffer::impl::loadFile(LoadFileJob job) {
+    File &file = buffer[job.bufferIndex];
+    {
+        lock_guard<mutex> lk(file.m);
+        assert(!file.valid);
+        cout << "loading file with index " << job.fileIndex << "\n";
+        dataReader.loadFile<float>(file.h_data, job.fileIndex);
+        cudaMemcpyAsync(file.d_data, file.h_data, sizeof(float)*file.size, cudaMemcpyHostToDevice, iostream);
+        cudaStreamSynchronize(iostream);
+        buffer[job.bufferIndex].valid = true;
+    }
+    file.cv.notify_all();
+}
+
+void GPUBuffer::impl::getSize(GetSizeJob job) {
+    size_t size = dataReader.fileLength(job.fileIndex);
+    job.size.set_value(size);
+}
+
+void GPUBuffer::impl::getAxis(GetAxisDoubleJob job) {
+    pair<size_t, double *> array;
+    array.first = dataReader.axisLength(job.fileIndex, job.axisName);
+    cudaError_t status = cudaMallocManaged(&array.second, array.first*sizeof(double));
+    dataReader.loadFile<double>(array.second, job.fileIndex, job.axisName);
+    job.axis.set_value(array);
+}
+
+void GPUBuffer::impl::getAxis(GetAxisIntJob job) {
+    pair<size_t, int *> array;
+    array.first = dataReader.axisLength(job.fileIndex, job.axisName);
+    cudaError_t status = cudaMallocManaged(&array.second, array.first*sizeof(int));
+    dataReader.loadFile<int>(array.second, job.fileIndex, job.axisName);
+    job.axis.set_value(array);
+}
+
+void GPUBuffer::loadFile(size_t fileIndex, size_t bufferIndex) {
+    pImpl->loadFile(fileIndex, bufferIndex);
+}
+
+void GPUBuffer::impl::loadFile(size_t fileIndex, size_t bufferIndex) {
+    LoadFileJob job = {
+        .fileIndex = fileIndex,
+        .bufferIndex = bufferIndex
+    };
+
+    // Main thread theoretically blocks on 2 mutexes here
+    // but it _should_ never have to wait for them.
+    {
+        File &file = buffer[bufferIndex];
+
+        std::unique_lock<std::mutex> lk(file.m, std::defer_lock);
+        bool lockval = lk.try_lock();
+        if (!lockval) {
+            cout << "waited on GPUBuffer during loadFile orchestration :(\n";
+            lk.lock();
+        }
+        file.valid = false;
+    }
+    {
+        std::unique_lock<std::mutex> lk(queuecv_m, std::defer_lock);
+        bool lockval = lk.try_lock();
+        if (!lockval) {
+            cout << "waited on IOworker queue during loadFile orchestration :(\n";
+            lk.lock();
+        }
+        jobs.push(job);
+    }
+    queuecv.notify_all();
+}
+
+void GPUBuffer::impl::worker() {
+    while(workerRunning) {
+        Job job;
+        {
+            unique_lock<mutex> lk(queuecv_m);
+            queuecv.wait(lk, [this]{ return !workerRunning || !jobs.empty(); });
+            if (!workerRunning) {
+                return;
+            }
+
+            job = move(jobs.front());
+            jobs.pop();
+        }
+        if(holds_alternative<LoadFileJob>(job)) {
+            loadFile(get<LoadFileJob>(job));
+        } else if(holds_alternative<GetSizeJob>(job)) {
+            getSize(move(get<GetSizeJob>(job)));
+        } else if(holds_alternative<GetAxisDoubleJob>(job)) {
+            getAxis(move(get<GetAxisDoubleJob>(job)));
+        } else if(holds_alternative<GetAxisIntJob>(job)) {
+            getAxis(move(get<GetAxisIntJob>(job)));
+        }
+
+    }
+}
+
+DataHandle GPUBuffer::getDataHandle(size_t bufferIndex) {
+    File &file = pImpl->buffer[bufferIndex];
+
+    // TODO: Might be nice to measure the blocking time here.
+    unique_lock<mutex> lk(file.m);
+    file.cv.wait(lk, [this, bufferIndex]{ return  pImpl->buffer[bufferIndex].valid; });
+
+    DataHandle dh = {
+        .d_data = file.d_data,
+        .size = file.size
+    };
+    return dh;
+}
\ No newline at end of file
diff --git a/src/hurricanedata/gpubuffer.h b/src/hurricanedata/gpubuffer.h
new file mode 100644
index 0000000..b953abe
--- /dev/null
+++ b/src/hurricanedata/gpubuffer.h
@@ -0,0 +1,60 @@
+#ifndef GPUBUFFER_H
+#define GPUBUFFER_H
+
+#include <string>
+#include <memory>
+#include <experimental/propagate_const>
+
+#include "datareader.h"
+
+struct DataHandle {
+    float *d_data; // Device memory
+    size_t size;
+    // Could include the data stored in host memory h_data in this handle if it were needed.
+};
+
+/**
+ * @brief Handles the asynchronous (un)loading data to the GPU. The rest of the
+ * application should not have to interface directly with this class. Getting data
+ * should go over the GPUBufferHandler class.
+ * 
+ * @note This class uses a queue to give loading jobs to the worker thread.
+ * NetCDF-C is not thread safe so you may never read data using netCDF directly
+ * on any other thread than this worker thread.
+ * 
+ * Assumes all the data in the nc4 files is the same size.
+ * 
+ */
+class GPUBuffer {
+public:
+    static constexpr size_t numBufferedFiles = 3; // Number of files stored in memory at one time.
+
+    GPUBuffer(DataReader& dataReader);
+
+    /**
+     * @brief Asynchronously tells the GPUBuffer to eventually load a particular file index
+     * into a buffer index (in which part of the buffer the file should be stored).
+     */
+    void loadFile(size_t fileIndex, size_t bufferIndex); // No blocking
+
+    /**
+     * @brief Get the values stored in a particular buffer index
+     * @return A struct DataHandle that points to the memory and gives its size.
+     */
+    DataHandle getDataHandle(size_t bufferIndex); // Potentially blocking
+
+    /**
+     * @brief Get the data from an axis (e.g. longitude) and its size.
+     * @note This is a blocking operation, so it makes a job for the worker
+     * to read the data and then waits untill the job is completed.
+     */
+    template <typename T>
+    std::pair<size_t, T *> getAxis(size_t fileIndex, const std::string& axisName); // Most probably blocking
+
+    ~GPUBuffer();
+private:
+    class impl;
+    std::experimental::propagate_const<std::unique_ptr<impl>> pImpl;
+};
+
+#endif //GPUBUFFER_H
diff --git a/src/hurricanedata/gpubufferhandler.cu b/src/hurricanedata/gpubufferhandler.cu
new file mode 100644
index 0000000..606a47c
--- /dev/null
+++ b/src/hurricanedata/gpubufferhandler.cu
@@ -0,0 +1,93 @@
+#include "gpubufferhandler.h"
+#include "fielddata.h"
+
+#include <iostream>
+
+using namespace std;
+
+GPUBufferHandler::GPUBufferHandler(GPUBuffer& gpuBuffer):
+gpuBuffer(gpuBuffer), fieldInd(0), bufferInd(0), fileInd(0) {
+    cudaMallocManaged(&fmd, sizeof(FieldMetadata));
+
+    auto [widthSize, lons] = gpuBuffer.getAxis<double>(0, "lon");
+    fmd->widthSize = widthSize;
+    fmd->lons = lons;
+
+    auto [heightSize, lats] = gpuBuffer.getAxis<double>(0, "lat");
+    fmd->heightSize = heightSize;
+    fmd->lats = lats;
+
+    auto [depthSize, levs] = gpuBuffer.getAxis<double>(0, "lev");
+    fmd->depthSize = depthSize;
+    fmd->levs = levs;
+
+    for (size_t i = 0; i < GPUBuffer::numBufferedFiles; i++) {
+        gpuBuffer.loadFile(fileInd,fileInd);
+        fileInd++;
+    }
+
+    fmd->timeSize = FieldData::numberOfTimeStepsPerField;
+
+    cudaMallocManaged(&fmd->times, sizeof(fmd->numberOfTimeStepsPerFile*sizeof(int)));
+
+    auto [numberOfTimeStepsPerFile, times] = gpuBuffer.getAxis<int>(0, "time");
+    fmd->numberOfTimeStepsPerFile = numberOfTimeStepsPerFile;
+    fmd->times = times;
+
+    cudaMallocManaged(&valArrays, sizeof(float *)*FieldData::FILESNUM);
+}
+
+FieldData GPUBufferHandler::setupField(size_t newEndBufferInd) {
+    
+    FieldData fd;
+    size_t fieldDataInd = 0;
+    fd.valArrays = valArrays;
+    cout << "getting field from files " << bufferInd  << " to " << newEndBufferInd << " with a field index of " << fieldInd << "\n";
+    for (int i = bufferInd; i <= newEndBufferInd; i++) {
+        DataHandle x = gpuBuffer.getDataHandle(i);
+        fd.valArrays[fieldDataInd] = x.d_data;
+        fieldDataInd++;
+    }
+    fd.fieldInd = fieldInd;
+
+    return fd;
+}
+
+FieldData GPUBufferHandler::nextFieldData() {
+    DataHandle x = gpuBuffer.getDataHandle(bufferInd);
+    size_t newFieldInd = (fieldInd + 1) % fmd->numberOfTimeStepsPerFile;
+    size_t newBufferInd = (bufferInd + ((fieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
+
+    size_t endFieldInd = (fieldInd + FieldData::numberOfTimeStepsPerField - 1) % fmd->numberOfTimeStepsPerFile;
+    size_t endBufferInd = (bufferInd + (fieldInd + FieldData::numberOfTimeStepsPerField - 1)/fmd->numberOfTimeStepsPerFile) % GPUBuffer::numBufferedFiles;
+
+    size_t newEndFieldInd = (endFieldInd + 1) % fmd->numberOfTimeStepsPerFile;
+    size_t newEndBufferInd = (endBufferInd + ((endFieldInd + 1) / fmd->numberOfTimeStepsPerFile)) % GPUBuffer::numBufferedFiles;
+
+    if(firstTimeStep) {
+        firstTimeStep = false;
+        return setupField(endBufferInd);
+    } 
+
+    fieldInd = newFieldInd;
+
+    if (newBufferInd != bufferInd) {
+        gpuBuffer.loadFile(fileInd, bufferInd);
+        fileInd++;
+        bufferInd = newBufferInd;
+    }
+
+    if (newEndBufferInd != endBufferInd) {
+        // maybe dont do things?
+    }
+
+    return setupField(newEndBufferInd);
+}
+
+GPUBufferHandler::~GPUBufferHandler() {
+    cudaFree(fmd->lons);
+    cudaFree(fmd->lats);
+    cudaFree(fmd->levs);
+    cudaFree(valArrays);
+    cudaFree(fmd);
+}
\ No newline at end of file
diff --git a/src/hurricanedata/gpubufferhandler.h b/src/hurricanedata/gpubufferhandler.h
new file mode 100644
index 0000000..2b4338b
--- /dev/null
+++ b/src/hurricanedata/gpubufferhandler.h
@@ -0,0 +1,64 @@
+#ifndef GPUBUFFERHANDLER_H
+#define GPUBUFFERHANDLER_H
+
+#include "fielddata.h"
+#include "gpubuffer.h"
+
+#include <string>
+
+/**
+ * @brief Responsible for deciding when the GPUBuffer should load and unload files.
+ * Also assembles and gives access to FieldData.
+ * 
+ * You will need to interface with this class.
+ */
+class GPUBufferHandler {
+public:
+    GPUBufferHandler(GPUBuffer& gpuBuffer);
+
+    /**
+     * @brief Produces a FieldData which can be used to retrieve values for a time-slice
+     * into a scalar field with a width of FieldData::numberOfTimeStepsPerField.
+     * 
+     * @details This method always increments the start point of the time-slice
+     * by 1. See below:
+     * 
+     * time steps                    =  0  1  2  3  4  5  6  7  8  9  10  11  12 13 ...
+     * files (4 time steps per file) = [0  1  2  3][4  5  6  7][8  9  10  11][12 13 ...
+     * nextFieldData() (1st call)    = [0  1]
+     * nextFieldData() (2nd call)    =    [1  2]
+     * nextFieldData() (3rd call)    =       [2  3]
+     * nextFieldData() (4th call)    =          [3  4]
+     * nextFieldData() (5th call)    =             [4  5]
+     * etc...
+     *  
+     * When getting values from a FieldData using the getVal method, 
+     * the time index is relative to the start of the time-slice.
+     * 
+     * This means that for d = nextFieldData() (4th call),
+     * getVal(.., fieldData=d, ..., timeInd = 1, ...) gives a value at 
+     * absolute time step 5 as seen above.
+     */
+    FieldData nextFieldData();
+
+    ~GPUBufferHandler();
+
+    /**
+     * You can get the FieldMetaData from here.
+     */
+    FieldMetadata *fmd;
+
+    static void freeFieldData();
+
+private:
+    FieldData setupField(size_t endBufferInd);
+    GPUBuffer& gpuBuffer;
+    size_t fileInd;
+    size_t bufferInd;
+    size_t fieldInd;
+    bool firstTimeStep = true;
+
+    float **valArrays;
+};
+
+#endif //GPUBUFFERHANDLER_H
diff --git a/src/illumination/illumination.h b/src/illumination/illumination.h
new file mode 100644
index 0000000..6e9b6f0
--- /dev/null
+++ b/src/illumination/illumination.h
@@ -0,0 +1,7 @@
+#ifndef ILLUMINATION_H
+#define ILLUMINATION_H
+
+#include "raycaster.h"
+#include "shading.h"
+
+#endif // ILLUMINATION_H
\ No newline at end of file
diff --git a/src/illumination/raycaster.h b/src/illumination/raycaster.h
new file mode 100644
index 0000000..a1c4292
--- /dev/null
+++ b/src/illumination/raycaster.h
@@ -0,0 +1,126 @@
+#ifndef RAYCASTER_H
+#define RAYCASTER_H
+
+#include <cuda_runtime.h>
+#include "linalg/linalg.h"
+#include "consts.h"
+#include "shading.h"
+
+
+// Raycast + phong, TODO: Consider wrapping in a class
+__global__ void raycastKernel(float* volumeData, unsigned char* framebuffer) {
+    int px = blockIdx.x * blockDim.x + threadIdx.x;
+    int py = blockIdx.y * blockDim.y + threadIdx.y;
+    if (px >= IMAGE_WIDTH || py >= IMAGE_HEIGHT) return;
+
+    float accumR = 0.0f;
+    float accumG = 0.0f;
+    float accumB = 0.0f;
+
+    // Multiple samples per pixel
+    for (int s = 0; s < SAMPLES_PER_PIXEL; s++) {
+        // Map to [-1, 1]
+        float u = ((px + 0.5f) / IMAGE_WIDTH ) * 2.0f - 1.0f;
+        float v = ((py + 0.5f) / IMAGE_HEIGHT) * 2.0f - 1.0f;
+
+        // TODO: Move this (and all similar transformation code) to its own separate file
+        float tanHalfFov = tanf(fov * 0.5f);
+        u *= tanHalfFov;
+        v *= tanHalfFov;
+
+        // Find ray direction
+        Vec3 cameraRight = (d_cameraDir.cross(d_cameraUp)).normalize();
+        d_cameraUp = (cameraRight.cross(d_cameraDir)).normalize();
+        Vec3 rayDir = (d_cameraDir + cameraRight*u + d_cameraUp*v).normalize();
+
+        // Intersect (for simplicity just a 3D box from 0 to 1 in all dimensions) - TODO: Think about whether this is the best way to do this
+        float tNear = 0.0f;
+        float tFar  = 1e6f;
+        auto intersectAxis = [&](float start, float dirVal) {
+            if (fabsf(dirVal) < epsilon) {
+                if (start < 0.f || start > 1.f) {
+                    tNear = 1e9f;
+                    tFar  = -1e9f;
+                }
+            } else {
+                float t0 = (0.0f - start) / dirVal;
+                float t1 = (1.0f - start) / dirVal;
+                if (t0>t1) { 
+                    float tmp=t0; 
+                    t0=t1; 
+                    t1=tmp; 
+                }
+                if (t0>tNear) tNear = t0;
+                if (t1<tFar ) tFar  = t1;
+            }
+        };
+
+        intersectAxis(d_cameraPos.x, rayDir.x);
+        intersectAxis(d_cameraPos.y, rayDir.y);
+        intersectAxis(d_cameraPos.z, rayDir.z);
+
+        if (tNear > tFar) continue;  // No intersectionn
+        if (tNear < 0.0f) tNear = 0.0f;
+
+        float colorR = 0.0f, colorG = 0.0f, colorB = 0.0f;
+        float alphaAccum = 0.0f;
+
+        float tCurrent = tNear;
+        while (tCurrent < tFar && alphaAccum < alphaAcumLimit) {
+            Point3 pos = d_cameraPos + rayDir * tCurrent;
+
+            // Convert to volume indices
+            float fx = pos.x * (VOLUME_WIDTH  - 1);
+            float fy = pos.y * (VOLUME_HEIGHT - 1);
+            float fz = pos.z * (VOLUME_DEPTH  - 1);
+            int ix = (int)roundf(fx);
+            int iy = (int)roundf(fy);
+            int iz = (int)roundf(fz);
+
+            // Sample
+            float density = sampleVolumeNearest(volumeData, VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH, ix, iy, iz);
+
+            // Basic transfer function. TODO: Move to a separate file, and then improve
+            float alphaSample = density * 0.1f;
+            // float alphaSample = 1.0f - expf(-density * 0.1f);
+            Color3 baseColor = Color3::init(density, 0.1f*density, 1.f - density);  // TODO: Implement a proper transfer function
+
+            // If density ~ 0, skip shading
+            if (density > minAllowedDensity) {
+                Vec3 grad = computeGradient(volumeData, VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH, ix, iy, iz);
+                Vec3 normal = -grad.normalize();
+
+                Vec3 lightDir = (d_lightPos - pos).normalize();
+                Vec3 viewDir  = -rayDir.normalize();
+
+                // Apply Phong
+                Vec3 shadedColor = phongShading(normal, lightDir, viewDir, baseColor);
+
+                // Compose
+                colorR     += (1.0f - alphaAccum) * shadedColor.x * alphaSample;
+                colorG     += (1.0f - alphaAccum) * shadedColor.y * alphaSample;
+                colorB     += (1.0f - alphaAccum) * shadedColor.z * alphaSample;
+                alphaAccum += (1.0f - alphaAccum) * alphaSample;
+            }
+
+            tCurrent += stepSize;
+        }
+
+        accumR += colorR;
+        accumG += colorG;
+        accumB += colorB;
+    }
+
+    // Average samples
+    accumR /= (float)SAMPLES_PER_PIXEL;
+    accumG /= (float)SAMPLES_PER_PIXEL;
+    accumB /= (float)SAMPLES_PER_PIXEL;
+
+    // Final colour
+    int fbIndex = (py * IMAGE_WIDTH + px) * 3;
+    framebuffer[fbIndex + 0] = (unsigned char)(fminf(accumR, 1.f) * 255);
+    framebuffer[fbIndex + 1] = (unsigned char)(fminf(accumG, 1.f) * 255);
+    framebuffer[fbIndex + 2] = (unsigned char)(fminf(accumB, 1.f) * 255);
+}
+
+#endif // RAYCASTER_H
\ No newline at end of file
diff --git a/src/illumination/shading.h b/src/illumination/shading.h
new file mode 100644
index 0000000..fa3ee8e
--- /dev/null
+++ b/src/illumination/shading.h
@@ -0,0 +1,20 @@
+#ifndef SHADING_H
+#define SHADING_H
+
+#include "linalg/linalg.h"
+#include "consts.h"
+
+// TODO: Consider wrapping this in a class (?)
+__device__ Vec3 phongShading(const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& baseColor) {
+    Vec3 ambient = baseColor * ambientStrength;
+    double diff = fmax(normal.dot(lightDir), 0.0);
+    Vec3 diffuse = baseColor * (diffuseStrength * diff);
+
+    Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
+    double spec = pow(fmax(viewDir.dot(reflectDir), 0.0), shininess);
+    Vec3 specular = Vec3::init(1.0, 1.0, 1.0) * (specularStrength * spec);
+
+    return ambient + diffuse + specular;
+}
+
+#endif // SHADING_H
\ No newline at end of file
diff --git a/src/linalg/mat.h b/src/linalg/mat.h
index 6f70f09..92f6fdc 100644
--- a/src/linalg/mat.h
+++ b/src/linalg/mat.h
@@ -1 +1,24 @@
 #pragma once
+
+__device__ Vec3 computeGradient(float* volumeData, const int volW, const int volH, const int volD, int x, int y, int z) {
+    // Finite difference for partial derivatives.
+    // For boundary voxels - clamp to the boundary. 
+    // Normal should point from higher to lower intensities
+
+    int xm = max(x - 1, 0);
+    int xp = min(x + 1, volW  - 1);
+    int ym = max(y - 1, 0);
+    int yp = min(y + 1, volH - 1);
+    int zm = max(z - 1, 0);
+    int zp = min(z + 1, volD  - 1);
+
+    // Note: Assuming data is linearized (idx = z*w*h + y*w + x) TODO: Unlinearize if data not linear
+    float gx = volumeData[z  * volW * volH + y  * volW + xp]
+             - volumeData[z  * volW * volH + y  * volW + xm];
+    float gy = volumeData[z  * volW * volH + yp * volW + x ]
+             - volumeData[z  * volW * volH + ym * volW + x ];
+    float gz = volumeData[zp * volW * volH + y  * volW + x ]
+             - volumeData[zm * volW * volH + y  * volW + x ];
+
+    return Vec3::init(gx, gy, gz);
+}
diff --git a/src/linalg/vec.h b/src/linalg/vec.h
index ef505bd..9fc6745 100644
--- a/src/linalg/vec.h
+++ b/src/linalg/vec.h
@@ -3,16 +3,28 @@
 #include <cuda_runtime.h>
 #include <cmath>
 
-struct Vec3 {  // TODO: Maybe make this into a class
+struct Vec3 {  // TODO: Maybe make this into a class ... maybe
     double x, y, z;
 
-    __host__ __device__ Vec3() : x(0), y(0), z(0) {}
-    __host__ __device__ Vec3(double x, double y, double z) : x(x), y(y), z(z) {}
+    static __host__ __device__ Vec3 init(double x, double y, double z) {Vec3 v = {x, y, z}; return v;}
+    static __host__ __device__ Vec3 zero() { return Vec3::init(0, 0, 0); }
+    static __host__ __device__ Vec3 init() { return zero(); }
+    static __host__ __device__ Vec3 init(const double (&arr)[3]) { return Vec3::init(arr[0], arr[1], arr[2]); }
+
+    __host__ __device__ Vec3 operator+(const Vec3& b) const { return Vec3::init(x + b.x, y + b.y, z + b.z); }
+    __host__ __device__ Vec3& operator+=(const Vec3& b) { x += b.x; y += b.y; z += b.z; return *this; }
+    
+    __host__ __device__ Vec3 operator-() const { return Vec3::init(-x, -y, -z); }
+    __host__ __device__ Vec3 operator-(const Vec3& b) const { return Vec3::init(x - b.x, y - b.y, z - b.z); }
+    __host__ __device__ Vec3& operator-=(const Vec3& b) { x -= b.x; y -= b.y; z -= b.z; return *this; }
+    
+    __host__ __device__ Vec3 operator*(double b) const { return Vec3::init(x * b, y * b, z * b); }
+    __host__ __device__ Vec3& operator*=(double b) { x *= b; y *= b; z *= b; return *this; }
 
-    __host__ __device__ Vec3 operator+(const Vec3& b) const { return Vec3(x + b.x, y + b.y, z + b.z); }
-    __host__ __device__ Vec3 operator-(const Vec3& b) const { return Vec3(x - b.x, y - b.y, z - b.z); }
-    __host__ __device__ Vec3 operator*(double b) const { return Vec3(x * b, y * b, z * b); }
-    __host__ __device__ Vec3 operator-() const { return Vec3(-x, -y, -z); }
     __host__ __device__ double dot(const Vec3& b) const { return x * b.x + y * b.y + z * b.z; }
-    __host__ __device__ Vec3 normalize() const { double len = sqrt(x * x + y * y + z * z); return Vec3(x / len, y / len, z / len); }
-};
\ No newline at end of file
+    __host__ __device__ Vec3 cross(const Vec3& b) const { return Vec3::init(y * b.z - z * b.y, z * b.x - x * b.z, x * b.y - y * b.x); }
+    __host__ __device__ Vec3 normalize() const { double len = sqrt(x * x + y * y + z * z); return Vec3::init(x / len, y / len, z / len); }
+};
+
+typedef Vec3 Point3;
+typedef Vec3 Color3;
\ No newline at end of file
diff --git a/src/main.cu b/src/main.cu
index fafaec8..3c7b38a 100644
--- a/src/main.cu
+++ b/src/main.cu
@@ -1,102 +1,112 @@
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
 #include <iostream>
+#include <fstream>
 #include <cmath>
+#include <cuda_runtime.h>
+#include <vector>
+#include <algorithm>
 
-#include "linalg/linalg.h"
+#include "hurricanedata/datareader.h"
+#include "linalg/linalg.h" 
 #include "objs/sphere.h"
 #include "img/handler.h"
-
-#define WIDTH 3840
-#define HEIGHT 2160
-#define SAMPLES_PER_PIXEL 8
+#include "consts.h"
+#include "illumination/illumination.h"
 
 
-__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
-    double ambientStrength = 0.1;
-    double diffuseStrength = 0.8;
-    double specularStrength = 0.5;
-    int shininess = 64;
+static float* d_volume = nullptr;
 
-    Vec3 ambient = color * ambientStrength;
-    double diff = max(normal.dot(lightDir), 0.0);
-    Vec3 diffuse = color * (diffuseStrength * diff);
 
-    Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
-    double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
-    Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
-
-    return ambient + diffuse + specular;
+void getTemperature(std::vector<float>& temperatureData, int idx = 0) {
+    std::string path = "data/trimmed";
+    std::string variable = "T";
+    DataReader dataReader(path, variable);
+    size_t dataLength = dataReader.fileLength(idx);
+    temperatureData.resize(dataLength);
+    dataReader.loadFile(temperatureData.data(), idx);
 }
 
-__global__ void renderKernel(unsigned char* framebuffer, Sphere* spheres, int numSpheres, Vec3 lightPos) {
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-    if (x >= WIDTH || y >= HEIGHT) return;
+void getSpeed(std::vector<float>& speedData, int idx = 0) {
+    std::string path = "data/trimmed";
+    std::string varU = "U";
+    std::string varV = "V";
 
-    int pixelIndex = (y * WIDTH + x) * 3;
-    Vec3 rayOrigin(0, 0, 0);
-    Vec3 colCum(0, 0, 0);
+    DataReader dataReaderU(path, varU);
+    DataReader dataReaderV(path, varV);
 
-    double spp = static_cast<double>(SAMPLES_PER_PIXEL);
-    for (int sample = 0; sample < SAMPLES_PER_PIXEL; sample++) {
-        double u = (x + (sample / spp) - WIDTH / 2.0) / WIDTH;
-        double v = (y + (sample / spp) - HEIGHT / 2.0) / HEIGHT;
-        Vec3 rayDir(u, v, 1.0);
-        rayDir = rayDir.normalize();
+    size_t dataLength = dataReaderU.fileLength(idx);
+    speedData.resize(dataLength);
+    std::vector<float> uData(dataLength);
+    std::vector<float> vData(dataLength);
 
-        for (int i = 0; i < numSpheres; ++i) {
-            double t;
-            if (spheres[i].intersect(rayOrigin, rayDir, t)) {
-                Vec3 hitPoint = rayOrigin + rayDir * t;
-                Vec3 normal = (hitPoint - spheres[i].center).normalize();
-                Vec3 lightDir = (lightPos - hitPoint).normalize();
-                Vec3 viewDir = -rayDir;
+    dataReaderU.loadFile(uData.data(), idx);
+    dataReaderV.loadFile(vData.data(), idx);
 
-                colCum = colCum + phongShading(hitPoint, normal, lightDir, viewDir, spheres[i].color);
-            }
-        }
+    for (int i = 0; i < dataLength; i++) {
+        speedData[i] = sqrt(uData[i]*uData[i] + vData[i]*vData[i]);
+    }
+}
+
+int main(int argc, char** argv) {
+    std::vector<float> data;
+    // getTemperature(data);
+    getSpeed(data);
+
+
+    // TODO: Eveontually remove debug below (i.e., eliminate for-loop etc.)
+    // Generate debug volume data
+    float* hostVolume = new float[VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH];
+    // generateVolume(hostVolume, VOLUME_WIDTH, VOLUME_HEIGHT, VOLUME_DEPTH);
+    for (int i = 0; i < VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH; i++) {  // TODO: This is technically an unnecessary artifact of the old code taking in a float* instead of a std::vector
+        // Discard temperatures above a small star (supposedly, missing temperature values)
+        hostVolume[i] = data[i];
+        if (data[i] + epsilon >= infty) hostVolume[i] = 0.0f;
     }
 
-    // Average color across all samples
-    Vec3 color = colCum * (1.0 / SAMPLES_PER_PIXEL);
+    // Min-max normalization
+    float minVal = *std::min_element(hostVolume, hostVolume + VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH);
+    float maxVal = *std::max_element(hostVolume, hostVolume + VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH);
+    for (int i = 0; i < VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH; i++) {
+        hostVolume[i] = (hostVolume[i] - minVal) / (maxVal - minVal);
+    }
 
-    framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
-    framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
-    framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
-}
-
-
-
-int main() {
-    Sphere spheres[] = {
-        { Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) },  // Red sphere
-        { Vec3(-2, 1, 7), 1.0, Vec3(0.0, 1.0, 0.0) }, // Green sphere
-        { Vec3(2, -1, 6), 1.0, Vec3(0.0, 0.0, 1.0) }  // Blue sphere
-    };
-    int numSpheres = sizeof(spheres) / sizeof(Sphere);
-    Vec3 lightPos(5, 5, 0);
+    // Allocate + copy data to GPU
+    size_t volumeSize = sizeof(float) * VOLUME_WIDTH * VOLUME_HEIGHT * VOLUME_DEPTH;
+    cudaMalloc((void**)&d_volume, volumeSize);
+    cudaMemcpy(d_volume, hostVolume, volumeSize, cudaMemcpyHostToDevice);
 
+    // Allocate framebuffer
     unsigned char* d_framebuffer;
-    unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
-    Sphere* d_spheres;
-    cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
-    cudaMalloc(&d_spheres, numSpheres * sizeof(Sphere));
-    cudaMemcpy(d_spheres, spheres, numSpheres * sizeof(Sphere), cudaMemcpyHostToDevice);
+    size_t fbSize = IMAGE_WIDTH * IMAGE_HEIGHT * 3 * sizeof(unsigned char);
+    cudaMalloc((void**)&d_framebuffer, fbSize);
+    cudaMemset(d_framebuffer, 0, fbSize);
 
-    dim3 threadsPerBlock(16, 16);
-    dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x, 
-                   (HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
-    renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, d_spheres, numSpheres, lightPos);
+    // Copy external constants from consts.h to cuda
+    copyConstantsToDevice();
+
+    // Launch kernel
+    dim3 blockSize(16, 16);  // TODO: Figure out a good size for parallelization
+    dim3 gridSize((IMAGE_WIDTH + blockSize.x - 1)/blockSize.x,
+                  (IMAGE_HEIGHT + blockSize.y - 1)/blockSize.y);
+
+    raycastKernel<<<gridSize, blockSize>>>(
+        d_volume,
+        d_framebuffer
+    );
     cudaDeviceSynchronize();
 
-    cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
-    saveImage("output.ppm", h_framebuffer, WIDTH, HEIGHT);
+    // Copy framebuffer back to CPU
+    unsigned char* hostFramebuffer = new unsigned char[IMAGE_WIDTH * IMAGE_HEIGHT * 3];
+    cudaMemcpy(hostFramebuffer, d_framebuffer, fbSize, cudaMemcpyDeviceToHost);
 
+    // Export image
+    saveImage("output.ppm", hostFramebuffer, IMAGE_WIDTH, IMAGE_HEIGHT);
+
+    // Cleanup
+    delete[] hostVolume;
+    delete[] hostFramebuffer;
+    cudaFree(d_volume);
     cudaFree(d_framebuffer);
-    cudaFree(d_spheres);
-    delete[] h_framebuffer;
 
-    std::cout << "High-resolution image saved as output.ppm" << std::endl;
+    std::cout << "Phong-DVR rendering done. Image saved to output.ppm" << std::endl;
     return 0;
-}
+}
\ No newline at end of file
diff --git a/src/objs/sphere.h b/src/objs/sphere.h
index a9b20b4..dbac251 100644
--- a/src/objs/sphere.h
+++ b/src/objs/sphere.h
@@ -5,6 +5,7 @@
 
 #include "linalg/linalg.h"
 
+// TODO: This is technically just for debugging, but if it is to be used outside of that, it should be a made into a proper class (I mean, just look at those functions below, it screams "add class attributes")
 struct Sphere {
     Vec3 center;
     double radius;
@@ -20,4 +21,74 @@ struct Sphere {
         t = -b - h;
         return true;
     }
-};
\ No newline at end of file
+};
+
+// A function to generate two concentric spherical shells
+__host__ void generateVolume(float* volumeData, int volW, int volH, int volD) {
+    int cx = volW / 2;
+    int cy = volH / 2;
+    int cz = volD / 2;
+    float maxRadius = static_cast<float>(volW) * 0.5f;
+
+    // Two shells
+    float shell1Inner = 0.2f * maxRadius;
+    float shell1Outer = 0.3f * maxRadius;
+    float shell2Inner = 0.4f * maxRadius;
+    float shell2Outer = 0.5f * maxRadius;
+
+    float shell1Intensity = 0.8f;
+    float shell2Intensity = 0.6f;
+
+    for (int z = 0; z < volD; ++z) {
+        for (int y = 0; y < volH; ++y) {
+            for (int x = 0; x < volW; ++x) {
+                float dx = (float)(x - cx);
+                float dy = (float)(y - cy);
+                float dz = (float)(z - cz);
+                float dist = sqrtf(dx*dx + dy*dy + dz*dz);
+
+                float intensity = 0.0f;
+
+                // Shell 1
+                if (dist >= shell1Inner && dist <= shell1Outer) {
+                    float mid = 0.5f * (shell1Inner + shell1Outer);
+                    if (dist < mid) {
+                        float inFactor = (dist - shell1Inner) / (mid - shell1Inner);
+                        intensity += shell1Intensity * inFactor;
+                    } else {
+                        float outFactor = (shell1Outer - dist) / (shell1Outer - mid);
+                        intensity += shell1Intensity * outFactor;
+                    }
+                }
+
+                // Shell 2
+                if (dist >= shell2Inner && dist <= shell2Outer) {
+                    float mid = 0.5f * (shell2Inner + shell2Outer);
+                    if (dist < mid) {
+                        float inFactor = (dist - shell2Inner) / (mid - shell2Inner);
+                        intensity += shell2Intensity * inFactor;
+                    } else {
+                        float outFactor = (shell2Outer - dist) / (shell2Outer - mid);
+                        intensity += shell2Intensity * outFactor;
+                    }
+                }
+
+                if (intensity > 1.0f) intensity = 1.0f;
+                    volumeData[z * volW * volH + y * volW + x] = intensity;
+            }
+        }
+    }
+}
+
+// Samples the voxel nearest to the given coordinates. TODO: Can be re-used in other places so move
+__device__ float sampleVolumeNearest(float* volumeData, const int volW, const int volH, const int volD, int vx, int vy, int vz) {
+    if (vx < 0) vx = 0;
+    if (vy < 0) vy = 0;
+    if (vz < 0) vz = 0;
+    if (vx >= volW) vx = volW  - 1;
+    if (vy >= volH) vy = volH - 1;
+    if (vz >= volD) vz = volD  - 1;
+
+    int idx = vz * volW * volH + vy * volD + vx;
+    return volumeData[idx];
+}
\ No newline at end of file
diff --git a/src/tests/a.out b/src/tests/a.out
deleted file mode 100755
index 3cf0e96..0000000
Binary files a/src/tests/a.out and /dev/null differ
diff --git a/src/tests/deprecated_test.cu b/src/tests/deprecated_test.cu
deleted file mode 100644
index b661dad..0000000
--- a/src/tests/deprecated_test.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-#include <iostream>
-#include <fstream>
-
-#define WIDTH 800
-#define HEIGHT 600
-
-struct Vec3 {
-    double x, y, z;
-
-    __host__ __device__ Vec3() : x(0), y(0), z(0) {}
-    __host__ __device__ Vec3(double x, double y, double z) : x(x), y(y), z(z) {}
-
-    __host__ __device__ Vec3 operator+(const Vec3& b) const { return Vec3(x + b.x, y + b.y, z + b.z); }
-    __host__ __device__ Vec3 operator-(const Vec3& b) const { return Vec3(x - b.x, y - b.y, z - b.z); }
-    __host__ __device__ Vec3 operator*(double b) const { return Vec3(x * b, y * b, z * b); }
-    __host__ __device__ Vec3 operator-() const { return Vec3(-x, -y, -z); }
-    __host__ __device__ double dot(const Vec3& b) const { return x * b.x + y * b.y + z * b.z; }
-    __host__ __device__ Vec3 normalize() const { double len = sqrt(x * x + y * y + z * z); return Vec3(x / len, y / len, z / len); }
-};
-
-// Simple Phong lighting components
-struct Sphere {
-    Vec3 center;
-    double radius;
-    Vec3 color;
-
-    __device__ bool intersect(const Vec3& rayOrigin, const Vec3& rayDir, double& t) const {
-        Vec3 oc = rayOrigin - center;
-        double b = oc.dot(rayDir);
-        double c = oc.dot(oc) - radius * radius;
-        double h = b * b - c;
-        if (h < 0.0) return false;
-        h = sqrt(h);
-        t = -b - h;
-        return true;
-    }
-};
-
-__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
-    double ambientStrength = 0.1;
-    double diffuseStrength = 0.8;
-    double specularStrength = 0.5;
-    int shininess = 32;
-
-    // Ambient
-    Vec3 ambient = color * ambientStrength;
-
-    // Diffuse
-    double diff = max(normal.dot(lightDir), 0.0);
-    Vec3 diffuse = color * (diffuseStrength * diff);
-
-    // Specular
-    Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
-    double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
-    Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
-
-    return ambient + diffuse + specular;
-}
-
-__global__ void renderKernel(unsigned char* framebuffer, Sphere sphere, Vec3 lightPos) {
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-    if (x >= WIDTH || y >= HEIGHT) return;
-
-    int pixelIndex = (y * WIDTH + x) * 3;
-
-    Vec3 rayOrigin(0, 0, 0);
-    Vec3 rayDir((x - WIDTH / 2.0) / WIDTH, (y - HEIGHT / 2.0) / HEIGHT, 1.0);
-    rayDir = rayDir.normalize();
-
-    double t;
-    if (sphere.intersect(rayOrigin, rayDir, t)) {
-        Vec3 hitPoint = rayOrigin + rayDir * t;
-        Vec3 normal = (hitPoint - sphere.center).normalize();
-        Vec3 lightDir = (lightPos - hitPoint).normalize();
-        Vec3 viewDir = -rayDir;
-
-        Vec3 color = phongShading(hitPoint, normal, lightDir, viewDir, sphere.color);
-
-        framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
-        framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
-        framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
-    } else {
-        framebuffer[pixelIndex] = 0;
-        framebuffer[pixelIndex + 1] = 0;
-        framebuffer[pixelIndex + 2] = 0;
-    }
-}
-
-void saveImage(const char* filename, unsigned char* framebuffer) {
-    std::ofstream imageFile(filename, std::ios::out | std::ios::binary);
-    imageFile << "P6\n" << WIDTH << " " << HEIGHT << "\n255\n";
-    for (int i = 0; i < WIDTH * HEIGHT * 3; i++) {
-        imageFile << framebuffer[i];
-    }
-    imageFile.close();
-}
-
-int main() {
-    // Initialize sphere and light source
-    Sphere sphere = { Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) };  // Red sphere
-    Vec3 lightPos(5, 5, 0);
-
-    // Allocate framebuffer on device and host
-    unsigned char* d_framebuffer;
-    unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
-    cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
-
-    // Launch
-    dim3 threadsPerBlock(16, 16);
-    dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x, 
-                   (HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
-    renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, sphere, lightPos);
-    cudaDeviceSynchronize();
-
-    // Copy result back to host and save
-    cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
-    saveImage("output.ppm", h_framebuffer);
-
-    // Clean up
-    cudaFree(d_framebuffer);
-    delete[] h_framebuffer;
-
-    std::cout << "Image saved as output.ppm" << std::endl;
-    return 0;
-}
diff --git a/src/tests/hello_world b/src/tests/hello_world
deleted file mode 100755
index bb85e75..0000000
Binary files a/src/tests/hello_world and /dev/null differ
diff --git a/src/tests/hello_world.cu b/src/tests/hello_world.cu
deleted file mode 100644
index a3b2c6a..0000000
--- a/src/tests/hello_world.cu
+++ /dev/null
@@ -1,16 +0,0 @@
-#include <iostream>
-#include <cuda_runtime.h>
-
-__global__ void hello_from_gpu() {
-    printf("Hello from GPU!\n");
-}
-
-int main() {
-    hello_from_gpu<<<1, 1>>>();
-
-    cudaDeviceSynchronize();
-
-    // Reset device
-    cudaDeviceReset();
-    return 0;
-}
diff --git a/src/tests/output.ppm b/src/tests/output.ppm
deleted file mode 100644
index 1d7b780..0000000
Binary files a/src/tests/output.ppm and /dev/null differ
diff --git a/src/tests/test_heavy.cu b/src/tests/test_heavy.cu
deleted file mode 100644
index 1f70108..0000000
--- a/src/tests/test_heavy.cu
+++ /dev/null
@@ -1,102 +0,0 @@
-#include <cuda_runtime.h>
-#include <device_launch_parameters.h>
-#include <iostream>
-#include <cmath>
-
-#include "linalg/linalg.h"
-#include "sphere.h"
-#include "img/handler.h"
-
-#define WIDTH 3840
-#define HEIGHT 2160
-#define SAMPLES_PER_PIXEL 8
-
-
-__device__ Vec3 phongShading(const Vec3& point, const Vec3& normal, const Vec3& lightDir, const Vec3& viewDir, const Vec3& color) {
-    double ambientStrength = 0.1;
-    double diffuseStrength = 0.8;
-    double specularStrength = 0.5;
-    int shininess = 64;
-
-    Vec3 ambient = color * ambientStrength;
-    double diff = max(normal.dot(lightDir), 0.0);
-    Vec3 diffuse = color * (diffuseStrength * diff);
-
-    Vec3 reflectDir = (normal * (2.0 * normal.dot(lightDir)) - lightDir).normalize();
-    double spec = pow(max(viewDir.dot(reflectDir), 0.0), shininess);
-    Vec3 specular = Vec3(1.0, 1.0, 1.0) * (specularStrength * spec);
-
-    return ambient + diffuse + specular;
-}
-
-__global__ void renderKernel(unsigned char* framebuffer, Sphere* spheres, int numSpheres, Vec3 lightPos) {
-    int x = blockIdx.x * blockDim.x + threadIdx.x;
-    int y = blockIdx.y * blockDim.y + threadIdx.y;
-    if (x >= WIDTH || y >= HEIGHT) return;
-
-    int pixelIndex = (y * WIDTH + x) * 3;
-    Vec3 rayOrigin(0, 0, 0);
-    Vec3 colCum(0, 0, 0);
-
-    double spp = static_cast<double>(SAMPLES_PER_PIXEL);
-    for (int sample = 0; sample < SAMPLES_PER_PIXEL; sample++) {
-        double u = (x + (sample / spp) - WIDTH / 2.0) / WIDTH;
-        double v = (y + (sample / spp) - HEIGHT / 2.0) / HEIGHT;
-        Vec3 rayDir(u, v, 1.0);
-        rayDir = rayDir.normalize();
-
-        for (int i = 0; i < numSpheres; ++i) {
-            double t;
-            if (spheres[i].intersect(rayOrigin, rayDir, t)) {
-                Vec3 hitPoint = rayOrigin + rayDir * t;
-                Vec3 normal = (hitPoint - spheres[i].center).normalize();
-                Vec3 lightDir = (lightPos - hitPoint).normalize();
-                Vec3 viewDir = -rayDir;
-
-                colCum = colCum + phongShading(hitPoint, normal, lightDir, viewDir, spheres[i].color);
-            }
-        }
-    }
-
-    // Average color across all samples
-    Vec3 color = colCum * (1.0 / SAMPLES_PER_PIXEL);
-
-    framebuffer[pixelIndex] = static_cast<unsigned char>(fmin(color.x, 1.0) * 255);
-    framebuffer[pixelIndex + 1] = static_cast<unsigned char>(fmin(color.y, 1.0) * 255);
-    framebuffer[pixelIndex + 2] = static_cast<unsigned char>(fmin(color.z, 1.0) * 255);
-}
-
-
-
-int main() {
-    Sphere spheres[] = {
-        { Vec3(0, 0, 5), 1.0, Vec3(1.0, 0.0, 0.0) },  // Red sphere
-        { Vec3(-2, 1, 7), 1.0, Vec3(0.0, 1.0, 0.0) }, // Green sphere
-        { Vec3(2, -1, 6), 1.0, Vec3(0.0, 0.0, 1.0) }  // Blue sphere
-    };
-    int numSpheres = sizeof(spheres) / sizeof(Sphere);
-    Vec3 lightPos(5, 5, 0);
-
-    unsigned char* d_framebuffer;
-    unsigned char* h_framebuffer = new unsigned char[WIDTH * HEIGHT * 3];
-    Sphere* d_spheres;
-    cudaMalloc(&d_framebuffer, WIDTH * HEIGHT * 3);
-    cudaMalloc(&d_spheres, numSpheres * sizeof(Sphere));
-    cudaMemcpy(d_spheres, spheres, numSpheres * sizeof(Sphere), cudaMemcpyHostToDevice);
-
-    dim3 threadsPerBlock(16, 16);
-    dim3 numBlocks((WIDTH + threadsPerBlock.x - 1) / threadsPerBlock.x, 
-                   (HEIGHT + threadsPerBlock.y - 1) / threadsPerBlock.y);
-    renderKernel<<<numBlocks, threadsPerBlock>>>(d_framebuffer, d_spheres, numSpheres, lightPos);
-    cudaDeviceSynchronize();
-
-    cudaMemcpy(h_framebuffer, d_framebuffer, WIDTH * HEIGHT * 3, cudaMemcpyDeviceToHost);
-    saveImage("output.ppm", h_framebuffer, WIDTH, HEIGHT);
-
-    cudaFree(d_framebuffer);
-    cudaFree(d_spheres);
-    delete[] h_framebuffer;
-
-    std::cout << "High-resolution image saved as output.ppm" << std::endl;
-    return 0;
-}