initial commit
This commit is contained in:
		
							
								
								
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.gitignore
									
									
									
									
										vendored
									
									
										Normal file
									
								
							@@ -0,0 +1 @@
 | 
			
		||||
build/
 | 
			
		||||
							
								
								
									
										6
									
								
								CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										6
									
								
								CMakeLists.txt
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,6 @@
 | 
			
		||||
# Build script for the `spices` command line tool.
#
# CMAKE_CXX_STANDARD values of 20 are only understood by CMake 3.12 and newer,
# and calling project() without a prior cmake_minimum_required() leaves policy
# settings unspecified.
cmake_minimum_required(VERSION 3.12)

project(spices2 LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 20)
# Fail at configure time instead of silently falling back to an older standard.
set(CMAKE_CXX_STANDARD_REQUIRED ON)

add_executable(spices src/spices.cpp src/SharedIndex.cpp src/BucketedZstdData.cpp)

# zstd is an implementation detail of the executable, hence PRIVATE.
target_link_libraries(spices PRIVATE zstd)
 | 
			
		||||
							
								
								
									
										60
									
								
								src/BucketedZstdData.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										60
									
								
								src/BucketedZstdData.cpp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,60 @@
 | 
			
		||||
#include <iostream>
 | 
			
		||||
 | 
			
		||||
#include "BucketedZstdData.hpp"
 | 
			
		||||
 | 
			
		||||
#include <zstd.h>
 | 
			
		||||
#include <cstring>
 | 
			
		||||
 | 
			
		||||
// Size in bytes of the file header: a single uint32 holding the number of index entries.
// Declared as a 64-bit value so that file-offset arithmetic built from these
// constants is carried out in 64 bits and cannot wrap for large ids / index sizes.
constexpr std::uint64_t headerSize = sizeof(std::uint32_t);

// Size in bytes of one index entry: a uint64 offset followed by a uint64 length.
constexpr std::uint64_t indexEntrySize = sizeof(std::uint64_t) * 2;
 | 
			
		||||
 | 
			
		||||
// Wraps an already opened stream. Only the reference is kept, so the stream
// has to stay alive for as long as this object is used.
BucketedZstdData::BucketedZstdData(std::ifstream &file)
	: file(file)
{
}
 | 
			
		||||
 | 
			
		||||
std::optional<std::vector<char>> BucketedZstdData::getDatasetWithId(std::uint32_t id) {
 | 
			
		||||
	file.seekg(0, std::ios::beg);
 | 
			
		||||
 | 
			
		||||
	uint32_t indexSize;
 | 
			
		||||
	file.read((char *)&indexSize, sizeof(indexSize));
 | 
			
		||||
	if(indexSize < id) return {};
 | 
			
		||||
 | 
			
		||||
	//seek to index entry
 | 
			
		||||
	file.seekg(sizeof(uint32_t) + indexEntrySize*id, std::ios::beg);
 | 
			
		||||
 | 
			
		||||
	uint64_t offset, length;
 | 
			
		||||
	file.read((char *)&offset, sizeof(offset));
 | 
			
		||||
	file.read((char *)&length, sizeof(length));
 | 
			
		||||
	if(length == 0) return {};
 | 
			
		||||
 | 
			
		||||
	file.seekg(offset + headerSize + indexSize * indexEntrySize, std::ios::beg);
 | 
			
		||||
	std::vector<char> inBuf(length);
 | 
			
		||||
	file.read(inBuf.data(), inBuf.size());
 | 
			
		||||
 | 
			
		||||
	std::vector<char> output(ZSTD_getFrameContentSize(inBuf.data(), inBuf.size()));
 | 
			
		||||
	if(!ZSTD_isError(ZSTD_decompress(output.data(), output.size(), inBuf.data(), inBuf.size()))) {
 | 
			
		||||
		return output;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return {};
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
std::optional<std::vector<std::vector<char>>> BucketedZstdData::getEntriesByID(std::uint32_t id) {
 | 
			
		||||
	std::optional<std::vector<char>> rawData = getDatasetWithId(id);
 | 
			
		||||
	if(!rawData.has_value()) return {};
 | 
			
		||||
 | 
			
		||||
	const char *fileIndex = rawData.value().data();
 | 
			
		||||
 | 
			
		||||
	std::vector<std::vector<char>> output;
 | 
			
		||||
	uint32_t readId;
 | 
			
		||||
	while(fileIndex < rawData.value().data() + rawData.value().size()) {
 | 
			
		||||
		const uint32_t *readId = (uint32_t *)fileIndex;
 | 
			
		||||
		fileIndex += sizeof(uint32_t);
 | 
			
		||||
 | 
			
		||||
		if(*readId == id) {
 | 
			
		||||
			std::vector<char> &object = output.emplace_back(std::vector<char>(*(uint32_t *)fileIndex));
 | 
			
		||||
 | 
			
		||||
			memcpy(object.data(), fileIndex + sizeof(uint32_t), object.size());
 | 
			
		||||
		}
 | 
			
		||||
		fileIndex += *(uint32_t *)fileIndex + sizeof(uint32_t);
 | 
			
		||||
	}
 | 
			
		||||
	return output;
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										18
									
								
								src/BucketedZstdData.hpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								src/BucketedZstdData.hpp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,18 @@
 | 
			
		||||
#ifndef BUCKETEDZSTDDATA_H
 | 
			
		||||
#define BUCKETEDZSTDDATA_H
 | 
			
		||||
 | 
			
		||||
#include <cstdint>
 | 
			
		||||
#include <vector>
 | 
			
		||||
#include <optional>
 | 
			
		||||
#include <fstream>
 | 
			
		||||
 | 
			
		||||
/**
 * Reader for a bucket file accessed through an std::ifstream.
 *
 * The object stores only a reference to the stream: the stream must outlive
 * this object, and every query repositions the stream's read cursor.
 */
class BucketedZstdData {
	public:
	/**
	 * @param file stream to read from; kept by reference, not copied.
	 *             Marked explicit so a stream is never converted implicitly.
	 */
	explicit BucketedZstdData(std::ifstream &file);

	/**
	 * @param id identifier of the dataset to load.
	 * @return the dataset bytes, or an empty optional if none could be produced.
	 */
	std::optional<std::vector<char>> getDatasetWithId(std::uint32_t id);

	/**
	 * @param id identifier of the entries to collect.
	 * @return one byte vector per entry, or an empty optional if no dataset
	 *         could be loaded for the id.
	 */
	std::optional<std::vector<std::vector<char>>> getEntriesByID(std::uint32_t id);

	private:
	// Non-owning; lifetime is managed by the caller.
	std::ifstream &file;
};
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										21
									
								
								src/SharedIndex.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										21
									
								
								src/SharedIndex.cpp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,21 @@
 | 
			
		||||
#include "SharedIndex.hpp"
 | 
			
		||||
 | 
			
		||||
#include <cstring>
 | 
			
		||||
#include <iostream>
 | 
			
		||||
 | 
			
		||||
// Binds the index to the caller's buffer. Only the reference is stored, so
// the vector has to stay alive for as long as this object is used.
SharedIndex::SharedIndex(const std::vector<char> &data)
	: data(data)
{
}
 | 
			
		||||
 | 
			
		||||
std::optional<std::uint64_t> SharedIndex::getID(const std::vector<char> &datasetName) {
 | 
			
		||||
	const char *fileIndex = data.data();
 | 
			
		||||
	fileIndex += sizeof(uint32_t);
 | 
			
		||||
 | 
			
		||||
	for(uint64_t entryN = 0;; ++entryN ) {
 | 
			
		||||
		if(fileIndex >= data.data() + data.size()) return {};
 | 
			
		||||
 | 
			
		||||
		if(*fileIndex == datasetName.size()) {
 | 
			
		||||
			if(!std::memcmp(fileIndex + 1, datasetName.data(), datasetName.size())) return entryN;
 | 
			
		||||
		}
 | 
			
		||||
 | 
			
		||||
		fileIndex += *fileIndex + 1;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
							
								
								
									
										16
									
								
								src/SharedIndex.hpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										16
									
								
								src/SharedIndex.hpp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,16 @@
 | 
			
		||||
#ifndef SHAREDINDEX_HPP
 | 
			
		||||
#define SHAREDINDEX_HPP
 | 
			
		||||
 | 
			
		||||
#include <cstdint>
 | 
			
		||||
#include <vector>
 | 
			
		||||
#include <optional>
 | 
			
		||||
 | 
			
		||||
/**
 * Name-to-ID lookup over a shared index held in a byte buffer.
 *
 * The object stores only a reference to the buffer: the vector passed to the
 * constructor must outlive this object. Passing a temporary leaves the object
 * with a dangling reference.
 */
class SharedIndex {
	public:
	/**
	 * @param data raw index bytes; kept by reference, not copied.
	 *             Marked explicit so a vector is never converted implicitly.
	 */
	explicit SharedIndex(const std::vector<char> &data);

	/**
	 * @param datasetName raw bytes of the name to look up.
	 * @return the ID associated with the name, or an empty optional if the
	 *         name is not found.
	 */
	std::optional<std::uint64_t> getID(const std::vector<char> &datasetName);

	private:
	// Non-owning; lifetime is managed by the caller.
	const std::vector<char> &data;
};
 | 
			
		||||
 | 
			
		||||
#endif
 | 
			
		||||
							
								
								
									
										84
									
								
								src/spices.cpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										84
									
								
								src/spices.cpp
									
									
									
									
									
										Normal file
									
								
							@@ -0,0 +1,84 @@
 | 
			
		||||
#include <iostream>
 | 
			
		||||
#include <fstream>
 | 
			
		||||
#include <filesystem>
 | 
			
		||||
 | 
			
		||||
// stdout and fileno()/_fileno() are declared by the C stdio header.
#include <stdio.h>

// IS_REDIRECTED evaluates to true when standard output is NOT attached to a
// terminal. The expansions are fully parenthesized so the macro behaves like
// a single expression wherever it is used.
#if defined(_WIN32)
#include <io.h>
#define IS_REDIRECTED (!(_isatty(_fileno(stdout))))

// macOS toolchains provide <unistd.h> but do not define __unix__, so it is
// checked for explicitly.
#elif defined(__unix__) || defined(__APPLE__)
#include <unistd.h>
#define IS_REDIRECTED (!(isatty(fileno(stdout))))

#else
#warning "Redirection cannot be checked, will always asume to be redirected"
#define IS_REDIRECTED true

#endif
 | 
			
		||||
 | 
			
		||||
#include "SharedIndex.hpp"
 | 
			
		||||
#include "BucketedZstdData.hpp"
 | 
			
		||||
 | 
			
		||||
/**
 * Reads a whole file into memory.
 *
 * @param filePath file to load.
 * @return the file's bytes (an empty vector for an empty file), or an empty
 *         optional when the file cannot be opened, its size cannot be
 *         determined, or it cannot be read completely.
 */
std::optional<std::vector<char>> readSharedIndex(const std::filesystem::path &filePath) {
	// Opened positioned at the end so the read position equals the file size.
	std::ifstream sharedIndexFile(filePath, std::ios::binary | std::ios::ate);
	if(!sharedIndexFile.good()) {
		return {};
	}

	// tellg() reports failure as -1, which must not be turned into a vector size.
	const std::streamoff fileSize = sharedIndexFile.tellg();
	if(fileSize < 0) {
		return {};
	}

	std::vector<char> sharedIndexData(static_cast<std::size_t>(fileSize));
	sharedIndexFile.seekg(0, std::ios::beg);

	if(!sharedIndexData.empty()) {
		// A short read would otherwise hand back a partially zero-filled buffer.
		if(!sharedIndexFile.read(sharedIndexData.data(), static_cast<std::streamsize>(sharedIndexData.size()))) {
			return {};
		}
	}
	return sharedIndexData;
}
 | 
			
		||||
 | 
			
		||||
int main(int argc, char **argv) {
 | 
			
		||||
	if(argc != 3 && argc != 4) {
 | 
			
		||||
		std::cerr << "usage: subreddit rootdirectory [force write to terminal(true | false)]" << std::endl;
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if(!IS_REDIRECTED && (argc != 4 || std::string(argv[3]) != "true")) {
 | 
			
		||||
		std::cerr << "output is not redirected, specify you want to write to the terminal" << std::endl;
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	const std::filesystem::path rootDirectory(argv[2]);
 | 
			
		||||
	const std::filesystem::path sharedIndexPath(rootDirectory / "sharedindex.shi");
 | 
			
		||||
 | 
			
		||||
	std::cerr << "Loading shared index..." << std::flush;
 | 
			
		||||
	std::vector<char> sharedIndexData;
 | 
			
		||||
	if(auto data = readSharedIndex(sharedIndexPath); data.has_value()) {
 | 
			
		||||
		sharedIndexData.swap(data.value());
 | 
			
		||||
	} else {
 | 
			
		||||
		std::cerr << "cannot find '" << sharedIndexPath << "'" << std::endl;
 | 
			
		||||
		return 1;
 | 
			
		||||
	}
 | 
			
		||||
		std::cerr << "Loaded shared index" << std::endl;
 | 
			
		||||
 | 
			
		||||
	std::string datesetString = argv[1];
 | 
			
		||||
	std::vector<char> datasetName(datesetString.begin(), datesetString.end());
 | 
			
		||||
 | 
			
		||||
	SharedIndex sharedIndex(sharedIndexData);
 | 
			
		||||
 | 
			
		||||
	std::cerr << "Fetching ID from shared index... " << std::flush;
 | 
			
		||||
	size_t totalEntries = 0;
 | 
			
		||||
	if(auto id = sharedIndex.getID(datasetName)) {
 | 
			
		||||
		std::cerr << "Found ID: " << id.value() << std::endl;
 | 
			
		||||
		for(const auto &file : std::filesystem::directory_iterator(rootDirectory)) {
 | 
			
		||||
			if(file.path().extension() == ".rda") {
 | 
			
		||||
				std::cerr << "Reading: " << file.path() << "... " << std::flush;
 | 
			
		||||
				std::ifstream fileStream(file.path(), std::ios::binary);
 | 
			
		||||
				BucketedZstdData bucket(fileStream);
 | 
			
		||||
 | 
			
		||||
				if(auto data = bucket.getEntriesByID(id.value()); data.has_value()) {
 | 
			
		||||
					std::cerr << "Found " << data.value().size() << " entries" << std::endl;
 | 
			
		||||
					totalEntries += data.value().size();
 | 
			
		||||
					for(const auto &entry : data.value()) {
 | 
			
		||||
						std::cout.write(entry.data(), entry.size()) << '\n';
 | 
			
		||||
					}
 | 
			
		||||
				}
 | 
			
		||||
			}
 | 
			
		||||
		}
 | 
			
		||||
		std::cerr << "Found a total of " << totalEntries << " entries" << std::endl;
 | 
			
		||||
	}
 | 
			
		||||
}
 | 
			
		||||
		Reference in New Issue
	
	Block a user