From a0cfd7553f7d19e900391743e773f8698b552076 Mon Sep 17 00:00:00 2001 From: arvidn Date: Sun, 20 Oct 2024 15:55:33 +0200 Subject: [PATCH] add tool to scrape DHT for random torrent files --- examples/Jamfile | 1 + examples/dht_scrape.cpp | 301 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 302 insertions(+) create mode 100644 examples/dht_scrape.cpp diff --git a/examples/Jamfile b/examples/Jamfile index 45b4f8f591..9151f46f28 100644 --- a/examples/Jamfile +++ b/examples/Jamfile @@ -44,6 +44,7 @@ exe make_torrent : make_torrent.cpp ; exe connection_tester : connection_tester.cpp ; exe upnp_test : upnp_test.cpp ; exe check_files : check_files.cpp ; +exe dht_scrape : dht_scrape.cpp ; explicit stage_client_test ; explicit stage_connection_tester ; diff --git a/examples/dht_scrape.cpp b/examples/dht_scrape.cpp new file mode 100644 index 0000000000..4babe4b9db --- /dev/null +++ b/examples/dht_scrape.cpp @@ -0,0 +1,301 @@ +/* + +Copyright (c) 2024, Arvid Norberg +All rights reserved. + +You may use, distribute and modify this code under the terms of the BSD license, +see LICENSE file. +*/ + +#include "libtorrent/session.hpp" +#include "libtorrent/alert_types.hpp" +#include "libtorrent/session_params.hpp" +#include "libtorrent/write_resume_data.hpp" +#include "libtorrent/bencode.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace lt; +using namespace lt::dht; +using namespace std::placeholders; + +using namespace std::literals::chrono_literals; + +lt::clock_type::duration const min_request_interval = 5min; + +#ifdef TORRENT_DISABLE_DHT + +int main(int, char*[]) +{ + std::cerr << "not built with DHT support\n"; + return 1; +} + +#else + +namespace { + +std::atomic_bool quit(false); + +void stop(int) { quit = true; } + +[[noreturn]] void usage() +{ + std::cerr << "USAGE: dht-sample\n"; + exit(1); +} + +lt::session_params load_dht_state() +{ + std::fstream f(".dht", std::ios_base::in | std::ios_base::binary); + f.unsetf(std::ios_base::skipws); + std::cout << "load dht state from .dht\n"; + std::vector const state(std::istream_iterator{f} + , std::istream_iterator{}); + + if (f.bad() || state.empty()) + { + std::cerr << "failed to read .dht\n"; + return {}; + } + return read_session_params(state); +} + +struct node_entry +{ + lt::time_point next_request = lt::min_time(); + lt::time_point last_seen = lt::clock_type::now(); +}; + +std::set info_hashes; + +void new_torrent(lt::session& ses, lt::sha1_hash ih) +{ + if (!info_hashes.insert(ih).second) return; + + lt::add_torrent_params adp; + adp.info_hashes.v1 = ih; + adp.save_path = "./non-existant-path"; + adp.file_priorities.resize(1000, lt::dont_download); + adp.flags = torrent_flags::upload_mode; + ses.async_add_torrent(adp); +} +} // anonymous namespace + +int main(int argc, char*[]) +{ + if (argc != 1) usage(); + + namespace fs = std::filesystem; + + // list the directory of existing torrents, to populate our list. + for(auto const& p : fs::directory_iterator("torrents")) + { + std::string file = p.path().stem().string(); + if (file.size() == 40) + { + // v1 torrent + std::stringstream str(file); + lt::sha1_hash ih; + str >> ih; + info_hashes.insert(ih); + } + else if (file.size() == 64) + { + // v2 torrent (recorded as truncated hashes) + std::stringstream str(file); + lt::sha256_hash ih; + str >> ih; + info_hashes.insert(sha1_hash(span(ih.data(), 20))); + } + else if (file.size() == 40 + 1 + 64) + { + // hybrid torrent + std::stringstream str(file); + lt::sha1_hash ih; + str >> ih; + info_hashes.insert(ih); + } + } + std::cout << "know about " << info_hashes.size() << " torrents\n"; + + signal(SIGINT, &stop); + signal(SIGTERM, &stop); + + session_params sp = load_dht_state(); + sp.settings.set_bool(settings_pack::enable_lsd, false); + sp.settings.set_bool(settings_pack::enable_dht, true); + sp.settings.set_int(settings_pack::alert_mask + , lt::alert_category::error + | lt::alert_category::storage + | lt::alert_category::status + | lt::alert_category::dht_log + | lt::alert_category::dht_operation + | lt::alert_category::dht); + sp.settings.set_int(settings_pack::active_limit, 10000); + sp.settings.set_int(settings_pack::active_dht_limit, 10000); + sp.settings.set_int(settings_pack::active_downloads, 10000); + sp.settings.set_int(settings_pack::dht_announce_interval, 120); + sp.settings.set_int(settings_pack::alert_queue_size, 10000); + lt::session s(sp); + + lt::time_point next_send = lt::clock_type::now() + 5s; + lt::time_point next_node_prune = lt::clock_type::now() + 30min; + lt::time_point next_torrent_prune = lt::clock_type::now() + 6h; + std::map nodes; + + while (!quit) + { + s.wait_for_alert(5s); + + std::vector alerts; + s.pop_alerts(&alerts); + auto const now = lt::clock_type::now(); + for (alert* a : alerts) + { + if (auto* sa = lt::alert_cast(a)) + { + std::cout << "DHT sample response: " << sa->samples().size() << '\n'; + for (auto const& ih : sa->samples()) + new_torrent(s, ih); + + for (auto const& n : sa->nodes()) + { + auto it = nodes.find(n.second); + if (it == nodes.end()) + it = nodes.insert({n.second, {}}).first; + else + it->second.last_seen = now; + it->second.next_request = now + std::max(sa->interval + , min_request_interval); + } + std::cout.flush(); + } + else if (auto* dp = alert_cast(a)) + { + auto it = nodes.find(dp->node); + if (it == nodes.end()) + nodes.insert({dp->node, {}}); + else + it->second.last_seen = now; + // it's too verbose to print these + continue; + } + else if (auto* aa = alert_cast(a)) + { + new_torrent(s, aa->info_hash); + } + else if (auto* p = alert_cast(a)) + { + torrent_handle const& h = p->handle; + h.save_resume_data(torrent_handle::save_info_dict); + } + else if (auto* rd = alert_cast(a)) + { + auto const& atp = rd->params; + std::vector buf = write_resume_data_buf(atp); + + std::stringstream filename; + filename << "torrents/"; + if (atp.info_hashes.has_v1()) + filename << atp.info_hashes.v1; + if (atp.info_hashes.has_v2()) + { + if (atp.info_hashes.has_v1()) + filename << "-"; + filename << atp.info_hashes.v2; + } + filename << ".torrent"; + std::fstream f(filename.str(), std::ios_base::out | std::ios_base::binary | std::ios_base::trunc); + f.write(buf.data(), static_cast(buf.size())); + s.remove_torrent(rd->handle); + std::cout << "saved torrent: " << filename.str() << '\n'; + // don't log this + continue; + } + else if (alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a) + || alert_cast(a)) + { + // it's too verbose to print these + continue; + } + std::cout << a->message() << '\n'; + } + + if (now > next_send) + { + next_send = now + 1s; + auto const it = std::find_if(nodes.begin(), nodes.end() + , [now](std::pair const& n) + { return n.second.next_request < now; }); + if (it != nodes.end()) + { + // just push this forward. If we get a response, this will be + // updated with the interval announced by the node + it->second.next_request = now + 1h; + sha1_hash target; + for (auto& b : target) b = std::uint8_t(std::rand()); + s.dht_sample_infohashes(it->first, target); + } + } + + if (now > next_node_prune) + { + next_node_prune = now + 30min; + + // remove any node that we haven't seen in 6 hours + for (auto it = nodes.begin(); it != nodes.end();) + { + if (it->second.last_seen + 6h < now) + it = nodes.erase(it); + else + ++it; + } + } + + // regularly, remove torrents that are too old, and probably won't + // receive metadata + if (now > next_torrent_prune) + { + next_torrent_prune = now + 6h; + std::vector const all_torrents = s.get_torrent_status([] (lt::torrent_status const&) -> bool { return true; }); + + std::time_t const ptime_now = ::time(nullptr); + for (auto st : all_torrents) + { + if (ptime_now - st.added_time > 12 * 3600) + { + s.remove_torrent(st.handle); + std::cout << "failed to receive metadata: " << st.info_hashes << '\n'; + } + } + } + } + + std::vector const state = write_session_params_buf(s.session_state(session::save_dht_state)); + std::fstream f(".dht", std::ios_base::out | std::ios_base::binary | std::ios_base::trunc); + f.write(state.data(), static_cast(state.size())); + + return 0; +} + +#endif