From 7f492139bf27ea3f4bfe981263a2efb6af43d67c Mon Sep 17 00:00:00 2001 From: Finn Dane Date: Tue, 11 Apr 2023 15:51:00 +0200 Subject: [PATCH] initial commit --- .gitignore | 1 + CMakeLists.txt | 6 +++ src/BucketedZstdData.cpp | 60 ++++++++++++++++++++++++++++ src/BucketedZstdData.hpp | 18 +++++++++ src/SharedIndex.cpp | 21 ++++++++++ src/SharedIndex.hpp | 16 ++++++++ src/spices.cpp | 84 ++++++++++++++++++++++++++++++++++++++++ 7 files changed, 206 insertions(+) create mode 100644 .gitignore create mode 100644 CMakeLists.txt create mode 100644 src/BucketedZstdData.cpp create mode 100644 src/BucketedZstdData.hpp create mode 100644 src/SharedIndex.cpp create mode 100644 src/SharedIndex.hpp create mode 100644 src/spices.cpp diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..567609b --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +build/ diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..88c2165 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,6 @@ +project(spices2) + +set(CMAKE_CXX_STANDARD 20) + +add_executable(spices src/spices.cpp src/SharedIndex.cpp src/BucketedZstdData.cpp) +target_link_libraries(spices zstd) \ No newline at end of file diff --git a/src/BucketedZstdData.cpp b/src/BucketedZstdData.cpp new file mode 100644 index 0000000..cc18332 --- /dev/null +++ b/src/BucketedZstdData.cpp @@ -0,0 +1,60 @@ +#include + +#include "BucketedZstdData.hpp" + +#include +#include +/* Sizes used to navigate the file: a uint32_t header, followed by index entries made of two uint64_t values (offset, length) each. */ +constexpr int headerSize = sizeof(uint32_t); +constexpr int indexEntrySize = sizeof(uint64_t)*2; +/* Binds this reader to the caller's stream. Only a reference is stored, so the stream must stay alive while the reader is used. */ +BucketedZstdData::BucketedZstdData(std::ifstream &file) : file(file) {} +/* Loads and decompresses the data blob stored for `id`. Reads the uint32_t entry count at the start of the file, reads the (offset, length) index entry at position `id`, reads `length` bytes located at offset + headerSize + indexSize * indexEntrySize, and returns the result of ZSTD_decompress on them. Returns an empty optional when indexSize is smaller than id, when the entry's length is 0, or when ZSTD_decompress reports an error. */ +std::optional> BucketedZstdData::getDatasetWithId(std::uint32_t id) { + file.seekg(0, std::ios::beg); + + uint32_t indexSize; + file.read((char *)&indexSize, sizeof(indexSize)); + if(indexSize < id) return {}; /* NOTE(review): id == indexSize passes this check and then reads the entry one past the end of the index (indexSize is used as the entry count when the data offset is computed) - confirm whether this should reject id >= indexSize. */ + + //seek to index entry + file.seekg(sizeof(uint32_t) + indexEntrySize*id, std::ios::beg); + + uint64_t offset, length; + file.read((char *)&offset, sizeof(offset)); + 
file.read((char *)&length, sizeof(length)); + if(length == 0) return {}; + + file.seekg(offset + headerSize + indexSize * indexEntrySize, std::ios::beg); + std::vector inBuf(length); + file.read(inBuf.data(), inBuf.size()); +/* NOTE(review): none of the seekg()/read() calls in this function are checked for failure, so a short or unreadable file is not detected here. */ + std::vector output(ZSTD_getFrameContentSize(inBuf.data(), inBuf.size())); /* NOTE(review): per the zstd manual this call may return ZSTD_CONTENTSIZE_UNKNOWN or ZSTD_CONTENTSIZE_ERROR; the value is used as the buffer size without a check - confirm every frame stores its content size. */ + if(!ZSTD_isError(ZSTD_decompress(output.data(), output.size(), inBuf.data(), inBuf.size()))) { + return output; + } + + return {}; +} +/* Returns the payloads of all records tagged with `id` inside the dataset stored for `id`. The decompressed buffer is walked as consecutive records of the form: uint32_t record id, uint32_t payload length, payload bytes. Returns an empty optional when getDatasetWithId() yields no data. */ +std::optional>> BucketedZstdData::getEntriesByID(std::uint32_t id) { + std::optional> rawData = getDatasetWithId(id); + if(!rawData.has_value()) return {}; + + const char *fileIndex = rawData.value().data(); + + std::vector> output; + uint32_t readId; /* NOTE(review): never used; the loop declares its own pointer named readId that shadows this variable. */ + while(fileIndex < rawData.value().data() + rawData.value().size()) { + const uint32_t *readId = (uint32_t *)fileIndex; + fileIndex += sizeof(uint32_t); +/* fileIndex now points at the record's uint32_t payload length. */ + if(*readId == id) { + std::vector &object = output.emplace_back(std::vector(*(uint32_t *)fileIndex)); +/* Copy the payload bytes that follow the length field into the freshly sized vector. */ + memcpy(object.data(), fileIndex + sizeof(uint32_t), object.size()); + } + fileIndex += *(uint32_t *)fileIndex + sizeof(uint32_t); /* Skip the length field and the payload. NOTE(review): record fields are read through casted char pointers with no bounds or alignment checks - confirm the buffer is always well formed. */ + } + return output; +} \ No newline at end of file diff --git a/src/BucketedZstdData.hpp b/src/BucketedZstdData.hpp new file mode 100644 index 0000000..4fc54f8 --- /dev/null +++ b/src/BucketedZstdData.hpp @@ -0,0 +1,18 @@ +#ifndef BUCKETEDZSTDDATA_H +#define BUCKETEDZSTDDATA_H + +#include +#include +#include +#include +/* Reader for one data file accessed through a std::ifstream held by reference. getDatasetWithId() returns the decompressed blob for an id; getEntriesByID() returns the individual records in that blob tagged with the id. */ +class BucketedZstdData { + public: + BucketedZstdData(std::ifstream &file); + std::optional> getDatasetWithId(std::uint32_t id); + std::optional>> getEntriesByID(std::uint32_t id); + private: + std::ifstream &file; +}; + +#endif \ No newline at end of file diff --git a/src/SharedIndex.cpp b/src/SharedIndex.cpp new file mode 100644 index 0000000..d6fd336 --- /dev/null +++ b/src/SharedIndex.cpp @@ -0,0 +1,21 @@ +#include "SharedIndex.hpp" + +#include +#include +/* Keeps a reference to `data` rather than a copy, so the vector must outlive this object. */ +SharedIndex::SharedIndex(const std::vector &data) : data(data) {} +/* Looks up datasetName in the index buffer. Skips the first sizeof(uint32_t) bytes, then scans length-prefixed entries (one length byte followed by that many bytes). Returns the zero-based position of the first entry whose bytes equal datasetName, or an empty optional once the scan pointer reaches the end of the buffer. */ +std::optional SharedIndex::getID(const std::vector 
&datasetName) { + const char *fileIndex = data.data(); + fileIndex += sizeof(uint32_t); +/* entryN counts the entries visited so far and is the value returned on a match. */ + for(uint64_t entryN = 0;; ++entryN ) { + if(fileIndex >= data.data() + data.size()) return {}; +/* *fileIndex is the entry's one-byte length; the entry's bytes start at fileIndex + 1. NOTE(review): the byte is read as plain char, so where char is signed a length of 128 or more reads as negative, which would make this comparison fail and the advance at the end of the loop move backwards - confirm entries never exceed 127 bytes. */ + if(*fileIndex == datasetName.size()) { + if(!std::memcmp(fileIndex + 1, datasetName.data(), datasetName.size())) return entryN; + } +/* Step over the length byte and the entry's bytes to reach the next entry. */ + fileIndex += *fileIndex + 1; + } +} \ No newline at end of file diff --git a/src/SharedIndex.hpp b/src/SharedIndex.hpp new file mode 100644 index 0000000..38b432f --- /dev/null +++ b/src/SharedIndex.hpp @@ -0,0 +1,16 @@ +#ifndef SHAREDINDEX_HPP +#define SHAREDINDEX_HPP + +#include +#include +#include +/* Read-only view over an in-memory index buffer held by reference; getID() maps a dataset name to its position in that index. */ +class SharedIndex { + public: + SharedIndex(const std::vector &data); + std::optional getID(const std::vector &datasetName); + private: + const std::vector &data; +}; + +#endif \ No newline at end of file diff --git a/src/spices.cpp b/src/spices.cpp new file mode 100644 index 0000000..49a4ae3 --- /dev/null +++ b/src/spices.cpp @@ -0,0 +1,84 @@ +#include +#include +#include +/* IS_REDIRECTED evaluates to true when stdout is not a terminal: it negates _isatty() on Windows and isatty() on Unix. On any other platform it is hard-wired to true and a compile-time warning is emitted. */ +#ifdef _WIN32 +#include +#define IS_REDIRECTED !(_isatty(_fileno(stdout))) + +#elif __unix__ +#include +#define IS_REDIRECTED !(isatty(fileno(stdout))) + +#else +#warning "Redirection cannot be checked, will always asume to be redirected" +#define IS_REDIRECTED true + +#endif + +#include "SharedIndex.hpp" +#include "BucketedZstdData.hpp" +/* Reads the whole file at filePath into memory. The file is opened in binary mode positioned at the end so tellg() gives the size used for the buffer. Returns an empty optional when the stream is not good() after opening. */ +std::optional> readSharedIndex(const std::filesystem::path &filePath) { + std::ifstream sharedIndexFile(filePath, std::ios::binary | std::ios::ate); + if(!sharedIndexFile.good()) { + return {}; + } + std::vector sharedIndexData(sharedIndexFile.tellg()); + sharedIndexFile.seekg(0, std::ios::beg); + + sharedIndexFile.read(sharedIndexData.data(), sharedIndexData.size()); + return sharedIndexData; +} +/* Entry point. argv[1] is the dataset name to look up, argv[2] the root directory, and argv[3] may be "true" to allow writing to a terminal. Loads sharedindex.shi from the root directory, resolves the name to an id, then writes every matching entry from each .rda file in that directory to stdout, one per line. Progress and errors go to stderr. Returns 1 on wrong argument count, on a refused terminal write, or when the shared index cannot be read. */ +int main(int argc, char **argv) { + if(argc != 3 && argc != 4) { + std::cerr << "usage: subreddit rootdirectory [force write to terminal(true | false)]" << std::endl; + return 1; + } + + if(!IS_REDIRECTED && (argc != 4 || std::string(argv[3]) 
!= "true")) { + std::cerr << "output is not redirected, specify you want to write to the terminal" << std::endl; + return 1; + } +/* The shared index file is looked up directly inside the root directory. */ + const std::filesystem::path rootDirectory(argv[2]); + const std::filesystem::path sharedIndexPath(rootDirectory / "sharedindex.shi"); + + std::cerr << "Loading shared index..." << std::flush; + std::vector sharedIndexData; + if(auto data = readSharedIndex(sharedIndexPath); data.has_value()) { + sharedIndexData.swap(data.value()); + } else { + std::cerr << "cannot find '" << sharedIndexPath << "'" << std::endl; + return 1; + } + std::cerr << "Loaded shared index" << std::endl; +/* getID() takes the name as a vector, so the characters of argv[1] are copied into one. */ + std::string datesetString = argv[1]; + std::vector datasetName(datesetString.begin(), datesetString.end()); + + SharedIndex sharedIndex(sharedIndexData); + + std::cerr << "Fetching ID from shared index... " << std::flush; + size_t totalEntries = 0; + if(auto id = sharedIndex.getID(datasetName)) { + std::cerr << "Found ID: " << id.value() << std::endl; + for(const auto &file : std::filesystem::directory_iterator(rootDirectory)) { + if(file.path().extension() == ".rda") { + std::cerr << "Reading: " << file.path() << "... " << std::flush; + std::ifstream fileStream(file.path(), std::ios::binary); + BucketedZstdData bucket(fileStream); +/* NOTE(review): fileStream is not checked for a successful open before it is read through bucket. */ + if(auto data = bucket.getEntriesByID(id.value()); data.has_value()) { + std::cerr << "Found " << data.value().size() << " entries" << std::endl; + totalEntries += data.value().size(); + for(const auto &entry : data.value()) { + std::cout.write(entry.data(), entry.size()) << '\n'; + } + } + } + } + std::cerr << "Found a total of " << totalEntries << " entries" << std::endl; + } /* NOTE(review): there is no else branch - when getID() finds no match nothing more is printed and main falls off the end, which yields exit status 0. */ +} \ No newline at end of file