Skip to content

Commit

Permalink
Cloned NetCDF support as a template for JPEG. Implemented JPEG valida…
Browse files Browse the repository at this point in the history
…tion. refs: #277
  • Loading branch information
spanezz committed Nov 24, 2021
1 parent 57c4f37 commit 4f6cb4d
Show file tree
Hide file tree
Showing 8 changed files with 292 additions and 4 deletions.
5 changes: 4 additions & 1 deletion arki/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ nobase_dist_arkiinclude_HEADERS = \
scan/mock.h \
scan/odimh5.h \
scan/netcdf.h \
scan/jpeg.h \
segment.h \
segment/fwd.h \
segment/base.h \
Expand Down Expand Up @@ -339,7 +340,8 @@ libarkimet_la_SOURCES += \
scan/validator.cc \
scan/mock.cc \
scan/odimh5.cc \
scan/netcdf.cc
scan/netcdf.cc \
scan/jpeg.cc

libarkimet_test_la_SOURCES += \
tests/daemon.cc \
Expand Down Expand Up @@ -440,6 +442,7 @@ tests_arki_test_SOURCES += \
scan/mock-test.cc \
scan/odimh5-test.cc \
scan/netcdf-test.cc \
scan/jpeg-test.cc \
segment-test.cc \
segment/common-test.cc \
segment/missing-test.cc \
Expand Down
2 changes: 2 additions & 0 deletions arki/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ libarkimet_sources = [
'scan/mock.cc',
'scan/odimh5.cc',
'scan/netcdf.cc',
'scan/jpeg.cc',
'metadata/xargs.cc',
'dataset.cc',
'dataset/lock.cc',
Expand Down Expand Up @@ -302,6 +303,7 @@ test_arkimet_sources = [
'scan/mock-test.cc',
'scan/odimh5-test.cc',
'scan/netcdf-test.cc',
'scan/jpeg-test.cc',
'segment-test.cc',
'segment/common-test.cc',
'segment/missing-test.cc',
Expand Down
7 changes: 4 additions & 3 deletions arki/runtest
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,10 @@ ulimit -n 1024 || true

mkdir inbound
# Put data in test inbound area
cp -a "$TOP_SRCDIR/test/data/"* inbound/
cp -a "$TOP_SRCDIR/test/postproc" ./
cp -a "$TOP_SRCDIR/test/misc" ./
cp --reflink=auto -a "$TOP_SRCDIR/test/data/"* inbound/
cp --reflink=auto -a "$TOP_SRCDIR/test/postproc" ./
cp --reflink=auto -a "$TOP_SRCDIR/test/misc" ./
gunzip inbound/jpeg/*.gz
# Ensure that the directory can be removed at cleanup
chmod u+w -R "$TESTDIR"
# Create test dataset directories
Expand Down
9 changes: 9 additions & 0 deletions arki/scan-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ add_method("format_from_filename", [] {
wassert(actual(scan::Scanner::format_from_filename("test.nc.tar")) == "nc");
wassert(actual(scan::Scanner::format_from_filename("test.netcdf.tar")) == "nc");

wassert(actual(scan::Scanner::format_from_filename("test.jpg")) == "jpeg");
wassert(actual(scan::Scanner::format_from_filename("test.jpeg")) == "jpeg");
wassert(actual(scan::Scanner::format_from_filename("test.jpg.gz")) == "jpeg");
wassert(actual(scan::Scanner::format_from_filename("test.jpeg.gz")) == "jpeg");
wassert(actual(scan::Scanner::format_from_filename("test.jpg.zip")) == "jpeg");
wassert(actual(scan::Scanner::format_from_filename("test.jpeg.zip")) == "jpeg");
wassert(actual(scan::Scanner::format_from_filename("test.jpg.tar")) == "jpeg");
wassert(actual(scan::Scanner::format_from_filename("test.jpeg.tar")) == "jpeg");

wassert_throws(std::runtime_error, scan::Scanner::format_from_filename("test"));
wassert_throws(std::runtime_error, scan::Scanner::format_from_filename("test.zip"));
wassert_throws(std::runtime_error, scan::Scanner::format_from_filename("test.tar"));
Expand Down
8 changes: 8 additions & 0 deletions arki/scan.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#endif
#include "arki/scan/odimh5.h"
#include "arki/scan/netcdf.h"
#include "arki/scan/jpeg.h"
#ifdef HAVE_VM2
#include "arki/scan/vm2.h"
#endif
Expand Down Expand Up @@ -45,6 +46,7 @@ void init()

register_odimh5_scanner();
register_netcdf_scanner();
register_jpeg_scanner();

#ifdef HAVE_VM2
factories["vm2"] = [] {
Expand Down Expand Up @@ -118,6 +120,9 @@ const Validator& Scanner::get_validator(const std::string& format)
if (format == "nc")
return netcdf::validator();

if (format == "jpeg")
return jpeg::validator();

#ifdef HAVE_VM2
if (format == "vm2")
return vm2::validator();
Expand All @@ -143,6 +148,9 @@ std::string Scanner::normalise_format(const std::string& format, const char* def
if (f == "nc") return "nc";
if (f == "netcdf") return "nc";

if (f == "jpg") return "jpeg";
if (f == "jpeg") return "jpeg";

if (f == "yaml") return "yaml";
if (f == "arkimet") return "arkimet";
if (f == "metadata") return "arkimet";
Expand Down
32 changes: 32 additions & 0 deletions arki/scan/jpeg-test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "arki/metadata/tests.h"
#include "arki/scan/jpeg.h"
#include "arki/scan/validator.h"
#include "arki/utils/sys.h"

namespace {
using namespace std;
using namespace arki;
using namespace arki::tests;
using namespace arki::types;
using namespace arki::utils;

/// Test case grouping the checks for the JPEG scanning support
class Tests : public TestCase
{
    // Reuse the constructors of the base test case
    using TestCase::TestCase;

    void register_tests() override;
};

/// Instance of the test case, created with the name "arki_scan_jpeg"
Tests test("arki_scan_jpeg");

/// Register the test methods of this test case
void Tests::register_tests() {

// Run the JPEG validator on the sample image, both through a file
// descriptor and through an in-memory buffer
add_method("validator", [] {
    const scan::Validator& validator = scan::jpeg::validator();

    // 94701 is the size in bytes the test expects for the sample image
    sys::File in("inbound/jpeg/autumn.jpg", O_RDONLY);
    validator.validate_file(in, 0, 94701);

    // Pass the real buffer length: a hard-coded size would make the validator
    // read past the end of the buffer if the sample file were ever shorter
    std::string buf = sys::read_file("inbound/jpeg/autumn.jpg");
    validator.validate_buf(buf.data(), buf.size());
});

}

}
179 changes: 179 additions & 0 deletions arki/scan/jpeg.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
#include "jpeg.h"
#include "arki/metadata.h"
#include "arki/metadata/data.h"
#include "arki/segment.h"
#include "arki/types/source.h"
#include "arki/utils/string.h"
#include "arki/utils/sys.h"
#include "arki/scan/validator.h"
#include "arki/scan/mock.h"
#include <cstring>
#include <unistd.h>
#include <vector>
#include <stdexcept>
#include <string>
#include <sstream>
#include <memory>

using namespace std;
using namespace arki::types;
using namespace arki::utils;

namespace arki {
namespace scan {
namespace jpeg {

/**
 * Validator for JPEG data.
 *
 * Only the framing is checked: the data must begin with the Start Of Image
 * marker (0xff 0xd8) and end with the End Of Image marker (0xff 0xd9). The
 * image contents are not decoded.
 *
 * For reference about signatures, see
 * https://en.wikipedia.org/wiki/JPEG#Syntax_and_structure
 */
struct JPEGValidator : public Validator
{
    /// Name of the format handled by this validator
    std::string format() const override
    {
        return "jpeg";
    }

    /**
     * Check the `size` bytes found at `offset` in `fd`.
     *
     * Problems are reported through throw_check_error().
     */
    void validate_file(sys::NamedFileDescriptor& fd, off_t offset, size_t size) const override
    {
        // Two markers of two bytes each are needed at the very least
        if (size < 4)
            throw_check_error(fd, offset, "file segment to check is only " + std::to_string(size) + " bytes (minimum required for JPEG identification is 4)");

        unsigned char marker[2];

        // Leading marker: Start Of Image
        ssize_t nread = fd.pread(marker, 2, offset);
        if (nread != 2)
            throw_check_error(fd, offset, "read only " + std::to_string(nread) + "/2 bytes of JPEG header");

        if (!(marker[0] == 0xff && marker[1] == 0xd8))
            throw_check_error(fd, offset, "JPEG Start Of Image signature not found");

        // Trailing marker: End Of Image, in the last two bytes of the segment
        nread = fd.pread(marker, 2, offset + size - 2);
        if (nread != 2)
            throw_check_error(fd, offset, "read only " + std::to_string(nread) + "/2 bytes of JPEG trailer");

        if (!(marker[0] == 0xff && marker[1] == 0xd9))
            throw_check_error(fd, offset, "JPEG End Of Image signature not found");
    }

    /**
     * Check the `size` bytes starting at `buf`.
     *
     * Problems are reported through throw_check_error().
     */
    void validate_buf(const void* buf, size_t size) const override
    {
        // Two markers of two bytes each are needed at the very least
        if (size < 4)
            throw_check_error("buffer is shorter than 4 bytes");

        const unsigned char* head = static_cast<const unsigned char*>(buf);
        if (!(head[0] == 0xff && head[1] == 0xd8))
            throw_check_error("JPEG Start Of Image signature not found");

        const unsigned char* tail = head + (size - 2);
        if (!(tail[0] == 0xff && tail[1] == 0xd9))
            throw_check_error("JPEG End Of Image signature not found");
    }
};

// Single validator instance for this translation unit, handed out by validator()
static JPEGValidator jpeg_validator;

/// Return a reference to the shared JPEG validator instance
const Validator& validator() { return jpeg_validator; }

}


/*
* JPEGScanner
*/

/**
 * Attach to `md` a note naming the scanned file and a blob source that spans
 * the whole segment file, from offset 0 to its size on disk.
 */
void JPEGScanner::set_blob_source(Metadata& md, std::shared_ptr<segment::Reader> reader)
{
    const auto& seg = reader->segment();

    // The size on disk gives the length of the blob
    struct stat info;
    sys::stat(seg.abspath, info);

    std::string note = "Scanned from ";
    note += str::basename(seg.relpath);
    md.add_note(note);

    md.set_source(Source::createBlob(reader, 0, info.st_size));
}

std::shared_ptr<Metadata> JPEGScanner::scan_nc_data(const std::vector<uint8_t>& data)
{
sys::Tempfile tmpfd;
tmpfd.write_all_or_throw(data.data(), data.size());
return scan_nc_file(tmpfd.name());
}

/**
 * Scan an in-memory buffer and return its metadata, with a copy of the data
 * attached as inline source.
 */
std::shared_ptr<Metadata> JPEGScanner::scan_data(const std::vector<uint8_t>& data)
{
    std::shared_ptr<Metadata> md = scan_nc_data(data);
    // Tag the inline data as "jpeg": this scanner was cloned from the NetCDF
    // one, and the "nc" label left behind marked JPEG data as NetCDF
    md->set_source_inline("jpeg", metadata::DataManager::get().to_data("jpeg", std::vector<uint8_t>(data)));
    return md;
}

/// Scan the file at `abspath` and return its metadata
std::shared_ptr<Metadata> JPEGScanner::scan_singleton(const std::string& abspath)
{
    auto md = scan_nc_file(abspath);
    return md;
}

/**
 * Scan the file behind `reader` and pass the resulting metadata to `dest`.
 *
 * Returns true without calling `dest` when sys::stat() gives no result for the
 * file (presumably because it does not exist) or when the file is empty;
 * otherwise returns what `dest` returns.
 *
 * Throws std::runtime_error if the segment is a directory.
 */
bool JPEGScanner::scan_segment(std::shared_ptr<segment::Reader> reader, metadata_dest_func dest)
{
    auto st = sys::stat(reader->segment().abspath);
    // Nothing to scan if there is no stat information for the file
    if (!st) return true;
    if (S_ISDIR(st->st_mode))
        throw std::runtime_error("JPEGScanner::scan_segment cannot be called on directory segments");
    // If the file is empty, skip it
    if (!st->st_size) return true;

    auto md = scan_nc_file(reader->segment().abspath);
    set_blob_source(*md, reader);
    return dest(md);
}

/**
 * Read everything available from `in`, scan it as a single JPEG image, and
 * pass the resulting metadata to `dest`.
 *
 * Returns what `dest` returns.
 */
bool JPEGScanner::scan_pipe(core::NamedFileDescriptor& in, metadata_dest_func dest)
{
    // Read all in a buffer.
    //
    // A read from a pipe can return fewer bytes than requested while more
    // data is still on its way, so only a read of 0 bytes is taken as end of
    // input: stopping at the first short read would truncate the image.
    //
    // NOTE(review): like the code it replaces, this assumes in.read() reports
    // errors by throwing rather than by returning a negative value — confirm.
    std::vector<uint8_t> buf;
    const size_t blocksize = 4096;
    while (true)
    {
        const size_t used = buf.size();
        buf.resize(used + blocksize);
        const size_t got = in.read(buf.data() + used, blocksize);
        // Drop the part of the block that was not filled
        buf.resize(used + got);
        if (got == 0)
            break;
    }

    return dest(scan_data(buf));
}


/*
* MockJPEGScanner
*/

/// Create the scanner together with the MockEngine it uses for lookups
MockJPEGScanner::MockJPEGScanner()
    : engine(new MockEngine())
{
}

/// Release the MockEngine allocated by the constructor
MockJPEGScanner::~MockJPEGScanner()
{
delete engine;
}

/**
 * Load the whole file at `pathname` in memory and return the metadata that the
 * mock engine looks up for its contents.
 */
std::shared_ptr<Metadata> MockJPEGScanner::scan_nc_file(const std::string& pathname)
{
    const auto contents = sys::read_file(pathname);
    const auto* bytes = reinterpret_cast<const uint8_t*>(contents.data());
    return engine->lookup(bytes, contents.size());
}

/**
 * Return the metadata that the mock engine looks up for the contents of
 * `data`, without going through a temporary file.
 */
std::shared_ptr<Metadata> MockJPEGScanner::scan_nc_data(const std::vector<uint8_t>& data)
{
    const uint8_t* bytes = data.data();
    const size_t length = data.size();
    return engine->lookup(bytes, length);
}


/**
 * Register the factory that creates the scanner for the "jpeg" format.
 *
 * The factory must be registered under "jpeg": using "nc", as inherited from
 * the NetCDF code this file was cloned from, makes it compete with the NetCDF
 * scanner for the same format name and leaves "jpeg" without a scanner.
 */
void register_jpeg_scanner()
{
    Scanner::register_factory("jpeg", [] {
        return std::make_shared<scan::MockJPEGScanner>();
    });
}

}
}

54 changes: 54 additions & 0 deletions arki/scan/jpeg.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef ARKI_SCAN_JPEG_H
#define ARKI_SCAN_JPEG_H

#include <arki/scan.h>
#include <string>
#include <vector>

namespace arki {
namespace scan {
class MockEngine;

namespace jpeg {
/// Return the validator used to check JPEG data
const Validator& validator();
}

/**
 * Base class for scanners of JPEG images.
 *
 * It implements the Scanner entry points on top of scan_nc_file(), which
 * subclasses must provide. The "nc" in the names of the protected methods
 * comes from the NetCDF scanner this class was modelled on.
 */
class JPEGScanner : public Scanner
{
    /// Add to md a note about the scanned file and a blob source covering all of it
    void set_blob_source(Metadata& md, std::shared_ptr<segment::Reader> reader);

protected:
    /// Scan the file at `pathname` and return its metadata
    virtual std::shared_ptr<Metadata> scan_nc_file(const std::string& pathname) = 0;

    /// Scan an in-memory buffer and return its metadata
    virtual std::shared_ptr<Metadata> scan_nc_data(const std::vector<uint8_t>& data);

public:
    /// Name of the format handled by this scanner ("jpeg", not the "nc" of
    /// the NetCDF scanner)
    std::string name() const override { return "jpeg"; }

    std::shared_ptr<Metadata> scan_data(const std::vector<uint8_t>& data) override;
    bool scan_pipe(core::NamedFileDescriptor& in, metadata_dest_func dest) override;
    bool scan_segment(std::shared_ptr<segment::Reader> reader, metadata_dest_func dest) override;
    std::shared_ptr<Metadata> scan_singleton(const std::string& abspath) override;
};


/**
 * JPEG scanner that gets its metadata from a MockEngine lookup on the data,
 * instead of extracting it from the image.
 */
class MockJPEGScanner : public JPEGScanner
{
protected:
    /// Engine used for the lookups; allocated by the constructor and released
    /// by the destructor
    MockEngine* engine;

    std::shared_ptr<Metadata> scan_nc_file(const std::string& pathname) override;
    std::shared_ptr<Metadata> scan_nc_data(const std::vector<uint8_t>& data) override;

public:
    MockJPEGScanner();

    // The engine is owned through a raw pointer: a copy would share it with
    // the original and it would be deleted twice, so copying is disabled
    MockJPEGScanner(const MockJPEGScanner&) = delete;
    MockJPEGScanner& operator=(const MockJPEGScanner&) = delete;

    virtual ~MockJPEGScanner();
};


/// Register the factory for the scanner declared in this header with
/// Scanner::register_factory()
void register_jpeg_scanner();

}
}

#endif

0 comments on commit 4f6cb4d

Please sign in to comment.