major refactor: separated reading into a library
This commit is contained in:
parent
016640d392
commit
5fc5ba6032
@ -1,6 +1,9 @@
|
|||||||
project(spices2)
|
project(spices2)
|
||||||
|
|
||||||
|
cmake_minimum_required(VERSION 3.18)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 20)
|
set(CMAKE_CXX_STANDARD 20)
|
||||||
|
|
||||||
add_executable(spices src/spices.cpp src/SharedIndex.cpp src/BucketedZstdData.cpp)
|
add_subdirectory(src/RdaReader)
|
||||||
target_link_libraries(spices zstd)
|
|
||||||
|
add_subdirectory(src/rdaExtractor)
|
||||||
|
7
src/RdaReader/CMakeLists.txt
Normal file
7
src/RdaReader/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
# Library target containing the RDA reading code.
project(RdaReader)

set(CMAKE_CXX_STANDARD 20)

# Every source is listed with its explicit extension: letting CMake guess
# ".cpp" for "src/RdaReader" is deprecated behaviour (policy CMP0115).
add_library(RdaReader src/RdaReader.cpp src/SharedIndex.cpp src/BucketedZstdData.cpp)

# Consumers of the library get the public headers from include/.
target_include_directories(RdaReader PUBLIC include)

target_link_libraries(RdaReader zstd)
|
44
src/RdaReader/include/RdaReader.hpp
Normal file
44
src/RdaReader/include/RdaReader.hpp
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
#ifndef RDAREADER_HPP
|
||||||
|
#define RDAREADER_HPP
|
||||||
|
|
||||||
|
#include <optional>
|
||||||
|
#include <iostream>
|
||||||
|
#include <vector>
|
||||||
|
#include <mutex>
|
||||||
|
#include <functional>
|
||||||
|
|
||||||
|
/// Extracts the entries of a dataset from RDA streams and writes them,
/// one entry per line, to an output stream.
class RdaReader {
    public:
        /// Creates a reader without a logger; log messages are dropped.
        RdaReader();

        /// Creates a reader that hands every log message to `logger`.
        RdaReader(std::function<void(const std::string &)> logger);

        /// Resolves `datasetName` to an ID through `sharedIndex` and reads that
        /// ID from every stream in `rdas`, writing the entries to `output`.
        /// @return total number of entries written; 0 when the name has no ID.
        size_t readDataset(
            const std::vector<char> &datasetName,
            const std::vector<char> &sharedIndex,
            const std::vector<std::istream *> &rdas,
            std::ostream &output
        );

        /// Same lookup and extraction, with every write to `output` performed
        /// while `outputMutex` is held.
        /// @return total number of entries written; 0 when the name has no ID.
        size_t readDataset(
            const std::vector<char> &datasetName,
            const std::vector<char> &sharedIndex,
            const std::vector<std::istream *> &rdas,
            std::ostream &output,
            std::mutex &outputMutex
        );

        /// Writes the entries stored under `id` in a single RDA stream to `output`.
        /// @return number of entries written; 0 when the stream has none for `id`.
        size_t readRda(
            std::istream &rda,
            uint64_t id,
            std::ostream &output
        );

        /// Same extraction, with the writes to `output` performed while
        /// `outputMutex` is held.
        /// @return number of entries written; 0 when the stream has none for `id`.
        size_t readRda(
            std::istream &rda,
            uint64_t id,
            std::ostream &output,
            std::mutex &outputMutex
        );

    private:
        /// Optional log sink; empty when the default constructor was used.
        const std::function<void(const std::string &)> logger;

        /// Passes `string` to the logger when one is set.
        void log(const std::string &string);
};
|
||||||
|
|
||||||
|
#endif
|
62
src/RdaReader/src/RdaReader.cpp
Normal file
62
src/RdaReader/src/RdaReader.cpp
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
#include "RdaReader.hpp"
|
||||||
|
#include "SharedIndex.hpp"
|
||||||
|
#include "BucketedZstdData.hpp"
|
||||||
|
|
||||||
|
#include <mutex>
|
||||||
|
#include <execution>
|
||||||
|
#include <algorithm>
|
||||||
|
#include <atomic>
|
||||||
|
|
||||||
|
void RdaReader::log(const std::string &string) {
|
||||||
|
if(logger) logger(string);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Builds a reader with an empty logger, so log() has nothing to call.
RdaReader::RdaReader()
    : logger(nullptr)
{
}

/// Builds a reader that reports its progress through `logger`.
RdaReader::RdaReader(std::function<void(const std::string &)> logger)
    : logger(logger)
{
}
|
||||||
|
|
||||||
|
size_t RdaReader::readRda(std::istream &rda, uint64_t id, std::ostream &output, std::mutex &outputMutex) {
|
||||||
|
log("Reading an rda\n");
|
||||||
|
BucketedZstdData bucket(rda);
|
||||||
|
|
||||||
|
if(std::optional<std::vector<std::vector<char>>> data = bucket.getEntriesByID(id)) {
|
||||||
|
const std::lock_guard lock(outputMutex);
|
||||||
|
log("Writing " + std::to_string(data.value().size()) + " entries to output\n");
|
||||||
|
for(const auto &entry : data.value()) {
|
||||||
|
output.write(entry.data(), entry.size()) << '\n';
|
||||||
|
}
|
||||||
|
log("Done writing entries\n");
|
||||||
|
return data.value().size();
|
||||||
|
}
|
||||||
|
log("No entries found\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t RdaReader::readRda(std::istream &rda, uint64_t id, std::ostream &output) {
|
||||||
|
std::mutex dummyMutex;
|
||||||
|
return readRda(rda, id, output, dummyMutex);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t RdaReader::readDataset(const std::vector<char> &datasetName, const std::vector<char> &sharedIndex, const std::vector<std::istream *> &rdas, std::ostream &output, std::mutex &outputMutex) {
|
||||||
|
log("Reading shared index... ");
|
||||||
|
SharedIndex sharedIndexReader(sharedIndex);
|
||||||
|
|
||||||
|
if(std::optional<std::uint64_t> id = sharedIndexReader.getID(datasetName)) {
|
||||||
|
log("Found ID: " + std::to_string(id.value()) + '\n');
|
||||||
|
std::atomic_size_t totalEntries(0);
|
||||||
|
std::for_each(
|
||||||
|
std::execution::par,
|
||||||
|
rdas.begin(),
|
||||||
|
rdas.end(),
|
||||||
|
[this, &totalEntries, &id, &output, &outputMutex](std::istream * const &rda) {totalEntries += readRda(*rda, id.value(), output, outputMutex);}
|
||||||
|
);
|
||||||
|
return totalEntries;
|
||||||
|
}
|
||||||
|
log("No entries found\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t RdaReader::readDataset(const std::vector<char> &datasetName, const std::vector<char> &sharedIndex, const std::vector<std::istream *> &rdas, std::ostream &output) {
|
||||||
|
std::mutex dummyMutex;
|
||||||
|
return readDataset(datasetName, sharedIndex, rdas, output, dummyMutex);
|
||||||
|
}
|
6
src/rdaExtractor/CMakeLists.txt
Normal file
6
src/rdaExtractor/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# Executable built from rdaExtractor.cpp and linked against RdaReader.
project(rdaExtractor)

set(CMAKE_CXX_STANDARD 20)

add_executable(RdaReaderExec
    src/rdaExtractor.cpp
)
target_link_libraries(RdaReaderExec RdaReader)
|
80
src/rdaExtractor/src/rdaExtractor.cpp
Normal file
80
src/rdaExtractor/src/rdaExtractor.cpp
Normal file
@ -0,0 +1,80 @@
|
|||||||
|
#include <RdaReader.hpp>
|
||||||
|
|
||||||
|
#include <filesystem>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
|
// IS_REDIRECTED evaluates to true when stdout is not attached to a terminal.
// The expansion is fully parenthesised so it composes safely with other
// operators at the point of use.
#if defined(_WIN32)
#include <io.h>
#define IS_REDIRECTED (!_isatty(_fileno(stdout)))

#elif defined(__unix__) || defined(__APPLE__)
// macOS ships isatty() in <unistd.h> but its compilers do not define __unix__.
#include <unistd.h>
#define IS_REDIRECTED (!isatty(fileno(stdout)))

#else
#warning "Redirection cannot be checked, will always assume to be redirected"
#define IS_REDIRECTED true

#endif
|
||||||
|
|
||||||
|
// Held while a message is written to std::cerr, so messages go out one at a time.
std::mutex cerrMutex;

/// Writes `input` to std::cerr and flushes it, holding cerrMutex throughout.
void threadedLog(const std::string &input) {
    const std::lock_guard<std::mutex> guard(cerrMutex);
    std::cerr << input;
    std::cerr.flush();
}
|
||||||
|
|
||||||
|
/// Reads the complete file at `filePath` into memory.
/// @param filePath location of the shared index file.
/// @return the bytes of the file, or an empty optional when the file cannot be
///         opened, its size cannot be determined, or it cannot be read in full.
std::optional<std::vector<char>> readSharedIndex(const std::filesystem::path &filePath) {
    // Opening at the end (ate) lets tellg() report the file size.
    std::ifstream sharedIndexFile(filePath, std::ios::binary | std::ios::ate);
    if(!sharedIndexFile.good()) {
        return {};
    }

    // tellg() yields -1 on failure; used as a size that would request an
    // enormous allocation, so it is rejected explicitly.
    const std::streamoff fileSize = sharedIndexFile.tellg();
    if(fileSize < 0) {
        return {};
    }

    std::vector<char> sharedIndexData(static_cast<std::size_t>(fileSize));
    sharedIndexFile.seekg(0, std::ios::beg);
    sharedIndexFile.read(sharedIndexData.data(), static_cast<std::streamsize>(sharedIndexData.size()));

    // A short read would leave the tail of the buffer zero-filled; report
    // failure rather than handing back truncated data.
    if(sharedIndexFile.gcount() != static_cast<std::streamsize>(sharedIndexData.size())) {
        return {};
    }
    return sharedIndexData;
}
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
if(argc != 3 && argc != 4) {
|
||||||
|
std::cerr << "usage: datesetname rootdirectory [force write to terminal(true | false)]" << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!IS_REDIRECTED && (argc != 4 || std::string(argv[3]) != "true")) {
|
||||||
|
std::cerr << "output is not redirected, specify you want to write to the terminal" << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string datesetString = argv[1];
|
||||||
|
std::vector<char> datasetName(datesetString.begin(), datesetString.end());
|
||||||
|
|
||||||
|
const std::filesystem::path rootDirectory(argv[2]);
|
||||||
|
const std::filesystem::path sharedIndexPath(rootDirectory / "sharedindex.shi");
|
||||||
|
|
||||||
|
std::vector<char> sharedIndexData;
|
||||||
|
if(auto data = readSharedIndex(sharedIndexPath)) {
|
||||||
|
sharedIndexData = data.value();
|
||||||
|
} else {
|
||||||
|
std::cerr << "cannot find '" << sharedIndexPath << "'" << std::endl;
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::ifstream> rdasIfstreams;
|
||||||
|
for(const std::filesystem::directory_entry &file : std::filesystem::directory_iterator(rootDirectory)) {
|
||||||
|
if(file.path().extension() == ".rda") rdasIfstreams.emplace_back(std::ifstream(file.path(), std::ios::binary));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::istream *> rdaRefs;
|
||||||
|
for(std::ifstream &stream : rdasIfstreams) {
|
||||||
|
rdaRefs.push_back(&stream);
|
||||||
|
}
|
||||||
|
RdaReader rdaReader(threadedLog);
|
||||||
|
if(size_t totalRead = rdaReader.readDataset(datasetName, sharedIndexData, rdaRefs, std::cout)) {
|
||||||
|
std::cerr << "Found a total of " << totalRead << " entries" << std::endl;
|
||||||
|
} else {
|
||||||
|
std::cerr << "Cannot find '" << argv[1] << "' in the shared index" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
114
src/spices.cpp
114
src/spices.cpp
@ -1,114 +0,0 @@
|
|||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <filesystem>
|
|
||||||
#include <execution>
|
|
||||||
#include <algorithm>
|
|
||||||
#include <mutex>
|
|
||||||
#include <atomic>
|
|
||||||
|
|
||||||
#include "SharedIndex.hpp"
|
|
||||||
#include "BucketedZstdData.hpp"
|
|
||||||
|
|
||||||
// IS_REDIRECTED evaluates to true when stdout is not attached to a terminal.
// The expansion is fully parenthesised so it composes safely with other
// operators at the point of use.
#if defined(_WIN32)
#include <io.h>
#define IS_REDIRECTED (!_isatty(_fileno(stdout)))

#elif defined(__unix__) || defined(__APPLE__)
// macOS ships isatty() in <unistd.h> but its compilers do not define __unix__.
#include <unistd.h>
#define IS_REDIRECTED (!isatty(fileno(stdout)))

#else
#warning "Redirection cannot be checked, will always assume to be redirected"
#define IS_REDIRECTED true

#endif
|
|
||||||
|
|
||||||
// Held while a message is written to stderr, so messages go out one at a time.
std::mutex cerrMutex;

// printf-style logging to stderr while holding cerrMutex.
// The do { } while(0) wrapper makes `CERRLOG(...);` a single statement, so it
// can be used in an unbraced if/else without breaking the else branch.
#define CERRLOG(...) \
    do { \
        std::lock_guard lock(cerrMutex); \
        fprintf(stderr, __VA_ARGS__); \
    } while(0)
|
|
||||||
|
|
||||||
|
|
||||||
/// Reads the complete file at `filePath` into memory.
/// @param filePath location of the shared index file.
/// @return the bytes of the file, or an empty optional when the file cannot be
///         opened, its size cannot be determined, or it cannot be read in full.
std::optional<std::vector<char>> readSharedIndex(const std::filesystem::path &filePath) {
    // Opening at the end (ate) lets tellg() report the file size.
    std::ifstream sharedIndexFile(filePath, std::ios::binary | std::ios::ate);
    if(!sharedIndexFile.good()) {
        return {};
    }

    // tellg() yields -1 on failure; used as a size that would request an
    // enormous allocation, so it is rejected explicitly.
    const std::streamoff fileSize = sharedIndexFile.tellg();
    if(fileSize < 0) {
        return {};
    }

    std::vector<char> sharedIndexData(static_cast<std::size_t>(fileSize));
    sharedIndexFile.seekg(0, std::ios::beg);
    sharedIndexFile.read(sharedIndexData.data(), static_cast<std::streamsize>(sharedIndexData.size()));

    // A short read would leave the tail of the buffer zero-filled; report
    // failure rather than handing back truncated data.
    if(sharedIndexFile.gcount() != static_cast<std::streamsize>(sharedIndexData.size())) {
        return {};
    }
    return sharedIndexData;
}
|
|
||||||
|
|
||||||
void processRDA(const std::filesystem::directory_entry &file, std::atomic_size_t &totalEntries, uint64_t id, std::mutex &outputMutex) {
|
|
||||||
if(file.path().extension() == ".rda") {
|
|
||||||
std::string fileName(file.path().filename());
|
|
||||||
std::ifstream fileStream(file.path(), std::ios::binary);
|
|
||||||
CERRLOG("Reading %s\n", fileName.c_str());
|
|
||||||
BucketedZstdData bucket(fileStream);
|
|
||||||
|
|
||||||
if(std::optional<std::vector<std::vector<char>>> data = bucket.getEntriesByID(id)) {
|
|
||||||
totalEntries += data.value().size();
|
|
||||||
const std::lock_guard lock(outputMutex);
|
|
||||||
CERRLOG("Writing %s\n", fileName.c_str());
|
|
||||||
for(const auto &entry : data.value()) {
|
|
||||||
std::cout.write(entry.data(), entry.size()) << '\n';
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
CERRLOG("No entries found in %s\n", fileName.c_str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
|
||||||
if(argc != 3 && argc != 4) {
|
|
||||||
std::cerr << "usage: datesetname rootdirectory [force write to terminal(true | false)]" << std::endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(!IS_REDIRECTED && (argc != 4 || std::string(argv[3]) != "true")) {
|
|
||||||
std::cerr << "output is not redirected, specify you want to write to the terminal" << std::endl;
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::filesystem::path rootDirectory(argv[2]);
|
|
||||||
const std::filesystem::path sharedIndexPath(rootDirectory / "sharedindex.shi");
|
|
||||||
|
|
||||||
std::cerr << "Loading shared index..." << std::flush;
|
|
||||||
std::vector<char> sharedIndexData;
|
|
||||||
if(auto data = readSharedIndex(sharedIndexPath)) {
|
|
||||||
sharedIndexData = data.value();
|
|
||||||
} else {
|
|
||||||
std::cerr << "cannot find '" << sharedIndexPath << "'" << std::endl;
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
std::cerr << "Loaded shared index" << std::endl;
|
|
||||||
|
|
||||||
std::string datesetString = argv[1];
|
|
||||||
std::vector<char> datasetName(datesetString.begin(), datesetString.end());
|
|
||||||
|
|
||||||
SharedIndex sharedIndex(sharedIndexData);
|
|
||||||
|
|
||||||
std::cerr << "Fetching ID from shared index... " << std::flush;
|
|
||||||
std::atomic_size_t totalEntries(0);
|
|
||||||
if(std::optional<std::uint64_t> id = sharedIndex.getID(datasetName)) {
|
|
||||||
std::cerr << "Found ID: " << id.value() << std::endl;
|
|
||||||
|
|
||||||
std::mutex outputMutex;
|
|
||||||
std::for_each(
|
|
||||||
std::execution::par,
|
|
||||||
std::filesystem::begin(std::filesystem::directory_iterator(rootDirectory)),
|
|
||||||
std::filesystem::end(std::filesystem::directory_iterator()),
|
|
||||||
[&totalEntries, &id, &outputMutex](const auto& file) {processRDA(file, totalEntries, id.value(), outputMutex);}
|
|
||||||
);
|
|
||||||
|
|
||||||
std::cerr << "Found a total of " << totalEntries << " entries" << std::endl;
|
|
||||||
} else {
|
|
||||||
std::cerr << "Cannot find '" << argv[1] << "' in the shared index" << std::endl;
|
|
||||||
return 2;
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Reference in New Issue
Block a user