diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp
index ddfaf8bc96c..b6aa6d0982a 100644
--- a/base/daemon/BaseDaemon.cpp
+++ b/base/daemon/BaseDaemon.cpp
@@ -137,6 +137,7 @@ static DB::PipeFDs signal_pipe;
 #if USE_BREAKPAD
 static bool use_minidump = true;
+static std::atomic_bool init_minidump{false};
 static std::shared_ptr<google_breakpad::MinidumpDescriptor> descriptor;
 static std::shared_ptr<google_breakpad::ExceptionHandler> eh;
@@ -192,7 +193,9 @@ static void call_default_signal_handler(int sig)
 {
     signal(sig, SIG_DFL);
 #if USE_BREAKPAD
-    if (use_minidump)
+    bool not_init = false;
+    /// Only initialize minidump once
+    if (use_minidump && init_minidump.compare_exchange_strong(not_init, true))
         eh = std::shared_ptr<google_breakpad::ExceptionHandler>(
             new google_breakpad::ExceptionHandler(*descriptor, nullptr, dumpCallbackError, nullptr, true, -1));
 #endif
diff --git a/docker/CI/docker-compose-nexusfs.yml b/docker/CI/docker-compose-nexusfs.yml
new file mode 100644
index 00000000000..c4dda2deba8
--- /dev/null
+++ b/docker/CI/docker-compose-nexusfs.yml
@@ -0,0 +1,174 @@
+version: "3"
+
+services:
+  # After upgrade to docker-compose v2, we could use `include` instead of `extend`.
+  hdfs-namenode:
+    extends:
+      file: ./common/hdfs.yml
+      service: hdfs-namenode
+  hdfs-datanode:
+    extends:
+      file: ./common/hdfs.yml
+      service: hdfs-datanode
+  fdb:
+    extends:
+      file: ./common/fdb.yml
+      service: fdb
+  my_mysql:
+    extends:
+      file: ./common/mysql.yml
+      service: my_mysql
+  tso:
+    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
+    command: bash -c "fdbcli -C /config/fdb.cluster --exec \"configure new single ssd\"; tso-server --config-file /config/tso.yml"
+    depends_on:
+      - fdb
+      - hdfs-namenode
+    volumes:
+      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
+      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
+      - ./nexusfs/:/config/:ro
+      - ./test_output/tso/:/var/log/byconity/:rw
+    environment: &env
+      LD_LIBRARY_PATH: /opt/byconity/lib
+      PATH: /opt/byconity/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+      ASAN_OPTIONS:
+      TSAN_OPTIONS:
+      IS_CI_ENV: 1
+      CI_PIPELINE_NAME: CI
+    cap_add:
+      - SYS_PTRACE
+    healthcheck:
+      test: ["CMD", "curl", "localhost:18845"]
+      interval: 5s
+
+  server-0:
+    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
+    command: bash -c "(udf-manager --config-file /config/server.yml & clickhouse-server --config-file /config/server.yml)"
+    depends_on:
+      tso:
+        condition: service_healthy
+    ports:
+      - "9000:52145"
+      - "127.0.0.1:8123:21557"
+      - "127.0.0.1:9004:9004"
+    environment:
+      <<: *env
+      SERVER_ID: server-0
+    volumes:
+      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
+      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
+      - ./nexusfs/:/config/:ro
+      - ./test_output/server-0/:/var/log/byconity/:rw
+      - ./queries/:/opt/byconity/queries/:ro
+    cap_add:
+      - SYS_PTRACE
+    healthcheck:
+      test: ["CMD", "curl", "localhost:21557"]
+      interval: 5s
+
+  server-1:
+    image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1
+    command: bash -c "(udf-manager --config-file /config/server.yml & clickhouse-server --config-file /config/server.yml)"
+    depends_on:
+      tso:
+        condition: service_healthy
+    ports:
+      - "9001:52145"
+      - "127.0.0.1:8124:21557"
+    environment:
+      <<: *env
+      SERVER_ID: server-1
+    volumes:
+      - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro
+      - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro
+      - ./nexusfs/:/config/:ro
+      - ./test_output/server-1/:/var/log/byconity/:rw
+      - ./queries/:/opt/byconity/queries/:ro
+    cap_add:
+      - SYS_PTRACE
+    healthcheck:
+      test: ["CMD", "curl", "localhost:52145"]
+      interval: 5s
+
+  worker-write:
+    image: 
hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "clickhouse-server --config-file /config/worker.yml" + depends_on: + - server-0 + - server-1 + ports: + - "52149:52145" + environment: + <<: *env + WORKER_GROUP_ID: wg_write + VIRTUAL_WAREHOUSE_ID: vw_write + WORKER_ID: w0 + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/worker-write/:/var/log/byconity/:rw + - ./queries/:/opt/byconity/queries/:ro + cap_add: + - SYS_PTRACE + worker-default: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "(udf-manager --config-file /config/worker.yml & clickhouse-server --config-file /config/worker.yml)" + depends_on: + - server-0 + - server-1 + environment: + <<: *env + WORKER_GROUP_ID: wg_default + VIRTUAL_WAREHOUSE_ID: vw_default + WORKER_ID: r0 + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/worker-default/:/var/log/byconity/:rw + - ./queries/:/opt/byconity/queries/:ro + cap_add: + - SYS_PTRACE + daemon-manager: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "daemon-manager --config-file ./config/daemon-manager.yml" + depends_on: + server-0: + condition: service_healthy + server-1: + condition: service_healthy + environment: + <<: *env + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/daemon-manager/:/var/log/byconity/:rw + cap_add: + - SYS_PTRACE + restart: always + + resource-manager: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "resource-manager --config-file /config/resource-manager.yml" + depends_on: + - tso + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/rm/:/var/log/byconity/:rw + environment: + <<: *env + cap_add: + - SYS_PTRACE + +volumes: + fdb-data: + external: false + hdfs-namenode: + external: false + hdfs-datanode: + external: false diff --git a/docker/CI/nexusfs/conf.d/catalog.yml b/docker/CI/nexusfs/conf.d/catalog.yml new file mode 100644 index 00000000000..7ddd7231874 --- /dev/null +++ b/docker/CI/nexusfs/conf.d/catalog.yml @@ -0,0 +1,6 @@ +catalog: + name_space: default +catalog_service: + type: fdb + fdb: + cluster_file: /config/fdb.cluster diff --git a/docker/CI/nexusfs/conf.d/service_discovery.yml b/docker/CI/nexusfs/conf.d/service_discovery.yml new file mode 100644 index 00000000000..7627487161e --- /dev/null +++ b/docker/CI/nexusfs/conf.d/service_discovery.yml @@ -0,0 +1,115 @@ +service_discovery: + mode: local + cluster: default + disable_cache: false + cache_timeout: 5 + server: + psm: data.cnch.server + node: + - host: server-0 + hostname: server-0 + ports: + port: + - name: PORT2 + value: 21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + - host: server-1 + hostname: server-1 + ports: + port: + - name: PORT2 + value: 21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + tso: + psm: data.cnch.tso + node: + host: tso + hostname: tso + ports: + port: + - name: PORT0 + value: 18845 + 
- name: PORT2 + value: 9181 + resource_manager: + psm: data.cnch.resource_manager + node: + host: resource-manager + hostname: resource-manager + ports: + port: + name: PORT0 + value: 28989 + daemon_manager: + psm: data.cnch.daemon_manager + node: + host: daemon-manager + hostname: daemon-manager + ports: + port: + name: PORT0 + value: 17553 + vw_psm: data.cnch.vw + vw: + psm: data.cnch.vw + node: + - host: worker-write + hostname: worker-write + ports: + port: + - name: PORT2 + value: 21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + vw_name: vw_write + - host: worker-default + hostname: worker-default + ports: + port: + - name: PORT2 + value: 21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + vw_name: vw_default diff --git a/docker/CI/nexusfs/conf.d/storage.yml b/docker/CI/nexusfs/conf.d/storage.yml new file mode 100644 index 00000000000..020132e7b53 --- /dev/null +++ b/docker/CI/nexusfs/conf.d/storage.yml @@ -0,0 +1,18 @@ +hdfs_addr: hdfs://hdfs-namenode:9000 +storage_configuration: + disks: + hdfs_disk: + path: /user/clickhouse/ + type: bytehdfs + local_disk: + path: /var/byconity/data/ + type: local + policies: + default: + volumes: + hdfs: + default: hdfs_disk + disk: hdfs_disk + local: + default: local_disk + disk: local_disk diff --git a/docker/CI/nexusfs/daemon-manager.yml b/docker/CI/nexusfs/daemon-manager.yml new file mode 100644 index 00000000000..c4cbe3dcbf3 --- /dev/null +++ b/docker/CI/nexusfs/daemon-manager.yml @@ -0,0 +1,63 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: /var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 +http_port: 21557 +rpc_port: 30605 +tcp_port: 52145 +ha_tcp_port: 26247 +exchange_port: 47447 +exchange_status_port: 60611 +interserver_http_port: 30491 +listen_host: "0.0.0.0" +cnch_type: server +max_connections: 4096 +keep_alive_timeout: 3 +max_concurrent_queries: 200 +uncompressed_cache_size: 8589934592 +mark_cache_size: 5368709120 +path: /var/byconity/ +tmp_path: /var/byconity/tmp_data/ +users_config: /config/users.yml +default_profile: default +default_database: default +timezone: Europe/Moscow +mlock_executable: false +macros: + "-incl": macros + "-optional": true +builtin_dictionaries_reload_interval: 3600 +max_session_timeout: 3600 +default_session_timeout: 60 +dictionaries_config: "*_dictionary.xml" +format_schema_path: /var/byconity/format_schemas/ +perQuery: 1 +daemon_manager: + port: 17553 + daemon_jobs: + job: + - name: PART_GC + interval: 10000 + disable: 0 + - name: PART_MERGE + interval: 10000 + disable: 0 + - name: CONSUMER + interval: 10000 + disable: 0 + - name: GLOBAL_GC + interval: 5000 + disable: 1 + - name: PART_CLUSTERING + interval: 30000 + disable: 0 + - name: DEDUP_WORKER + interval: 3000 + disable: 0 + # Increasing the frequency of recycling in a test environment + - name: TXN_GC + interval: 3000 + disable: 0 diff --git a/docker/CI/nexusfs/fdb.cluster b/docker/CI/nexusfs/fdb.cluster new file mode 100644 index 00000000000..b04f02bc3b5 --- /dev/null +++ b/docker/CI/nexusfs/fdb.cluster @@ -0,0 +1 @@ +docker:docker@fdb:4550 diff --git a/docker/CI/nexusfs/resource-manager.yml b/docker/CI/nexusfs/resource-manager.yml new file mode 100644 index 00000000000..b53233f1d0f --- /dev/null +++ 
b/docker/CI/nexusfs/resource-manager.yml @@ -0,0 +1,29 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: /var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 +listen_host: "0.0.0.0" +path: /var/byconity/ +timezone: Europe/Moscow +perQuery: 1 +resource_manager: + port: 28989 + vws: + vw: + - name: vw_default + type: default + num_workers: 1 + worker_groups: + worker_group: + name: wg_default + type: Physical + - name: vw_write + type: write + num_workers: 1 + worker_groups: + worker_group: + name: wg_write + type: Physical diff --git a/docker/CI/nexusfs/server.yml b/docker/CI/nexusfs/server.yml new file mode 100644 index 00000000000..f03178bd0e2 --- /dev/null +++ b/docker/CI/nexusfs/server.yml @@ -0,0 +1,105 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: /var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 + console: true +additional_services: + GIS: 1 + VectorSearch: 1 + FullTextSearch: 1 +http_port: 21557 +rpc_port: 30605 +tcp_port: 52145 +ha_tcp_port: 26247 +exchange_port: 47447 +exchange_status_port: 60611 +interserver_http_port: 30491 +mysql_port: 9004 +listen_host: "0.0.0.0" +prometheus: + endpoint: "/metrics" + port: 0 + metrics: true + events: true + asynchronous_metrics: true + part_metrics: false +cnch_type: server +max_connections: 4096 +keep_alive_timeout: 3 +max_concurrent_queries: 200 +uncompressed_cache_size: 8589934592 +mark_cache_size: 5368709120 +path: /var/byconity/ +tmp_path: /var/byconity/tmp_data/ +users_config: /config/users.yml +default_profile: default +default_database: default +timezone: Europe/Moscow +mlock_executable: false +enable_tenant_systemdb: false +macros: + "-incl": macros + "-optional": true +builtin_dictionaries_reload_interval: 3600 +max_session_timeout: 3600 +default_session_timeout: 60 +dictionaries_config: "*_dictionary.xml" +format_schema_path: /var/byconity/format_schemas/ +perQuery: 1 +nexus_fs: + enable: 1 + use_memory_device: 0 + enable_async_io: 0 + cache_size: 5368709120 + region_size: 4194304 + segment_size: 524288 + enable_memory_buffer: 1 + memory_buffer_size: 1073741824 + clean_regions_pool: 16 + clean_region_threads: 4 + num_in_mem_buffers: 32 + reader_threads: 32 +merge_tree: + reorganize_marks_data_layout: 1 + enable_nexus_fs: 1 +cnch_kafka_log: + database: cnch_system + table: cnch_kafka_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +cnch_unique_table_log: + database: cnch_system + table: cnch_unique_table_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +cnch_query_log: + database: cnch_system + table: cnch_query_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +query_log: + database: system + table: query_log + flush_interval_milliseconds: 15000 + partition_by: event_date +part_allocation_algorithm: 1 +consistent_hash_ring: + num_replicas: 16 + num_probes: 21 + load_factor: 1.3 +udf_path: /var/byconity/data/user_defined +udf_manager_server: + timeout_ms: 20000 + max_retry: 1 +udf_processor: + count: 3 + uds_path: /dev/shm/udf_processor_server + timeout_ms: 10000 + max_retry: 1 +custom_settings_prefixes: SQL_ +restrict_tenanted_users_to_whitelist_settings: false +restrict_tenanted_users_to_privileged_operations: false +sensitive_permission_tenants: 1234 diff --git a/docker/CI/nexusfs/tso.yml b/docker/CI/nexusfs/tso.yml new file mode 100644 index 00000000000..095eb2ebe7e --- /dev/null +++ b/docker/CI/nexusfs/tso.yml @@ -0,0 +1,22 @@ +logger: + level: 
trace + log: /var/log/byconity/tso.log + errorlog: /var/log/byconity/tso.err.log + testlog: /var/log/byconity/tso.test.log + size: 1000M + count: 10 + console: false +listen_host: "0.0.0.0" +path: /var/byconity/tso +tmp_path: /var/byconity/tmp +tso_service: + type: fdb + fdb: + cluster_file: /config/fdb.cluster + port: 18845 + http: + port: 9181 + receive_timeout: 1800 + send_timeout: 1800 + tso_window_ms: 3000 + tso_get_leader_info_interval_ms: 0 diff --git a/docker/CI/nexusfs/users.yml b/docker/CI/nexusfs/users.yml new file mode 100644 index 00000000000..61e2e5a63d0 --- /dev/null +++ b/docker/CI/nexusfs/users.yml @@ -0,0 +1,38 @@ +profiles: + default: + load_balancing: random + log_queries: 1 + max_execution_time: 180 + exchange_timeout_ms: 300000 + enable_nexus_fs: 1 + +users: + default: + networks: + ip: ::/0 + password: "" + profile: default + quota: default + access_management: 1 + server: + networks: + ip: ::/0 + password: "" + profile: default + quota: default + probe: + networks: + ip: ::/0 + password: "" + profile: default + quota: default + +quotas: + default: + interval: + duration: 3600 + queries: 0 + errors: 0 + result_rows: 0 + read_rows: 0 + execution_time: 0 \ No newline at end of file diff --git a/docker/CI/nexusfs/worker.yml b/docker/CI/nexusfs/worker.yml new file mode 100644 index 00000000000..a97e011eb56 --- /dev/null +++ b/docker/CI/nexusfs/worker.yml @@ -0,0 +1,82 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: /var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 +http_port: 21557 +rpc_port: 30605 +tcp_port: 52145 +ha_tcp_port: 26247 +exchange_port: 47447 +exchange_status_port: 60611 +interserver_http_port: 30491 +listen_host: "0.0.0.0" +cnch_type: worker +vw_name: vw_default +max_connections: 4096 +keep_alive_timeout: 3 +max_concurrent_queries: 200 +uncompressed_cache_size: 8589934592 +mark_cache_size: 5368709120 +path: /var/byconity/ +tmp_path: /var/byconity/tmp_data/ +users_config: /config/users.yml +default_profile: default +default_database: default +timezone: Europe/Moscow +mlock_executable: false +enable_tenant_systemdb: false +macros: + "-incl": macros + "-optional": true +builtin_dictionaries_reload_interval: 3600 +max_session_timeout: 3600 +default_session_timeout: 60 +dictionaries_config: "*_dictionary.xml" +format_schema_path: /var/byconity/format_schemas/ +perQuery: 1 +nexus_fs: + enable: 1 + use_memory_device: 0 + enable_async_io: 0 + cache_size: 5368709120 + region_size: 4194304 + segment_size: 524288 + enable_memory_buffer: 1 + memory_buffer_size: 1073741824 + clean_regions_pool: 16 + clean_region_threads: 4 + num_in_mem_buffers: 32 + reader_threads: 32 +merge_tree: + reorganize_marks_data_layout: 1 + enable_nexus_fs: 1 +cnch_unique_table_log: + database: cnch_system + table: cnch_unique_table_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +query_log: + database: system + table: query_log + flush_interval_milliseconds: 15000 + partition_by: event_date +udf_path: /var/byconity/data/user_defined +udf_manager_server: + timeout_ms: 20000 + max_retry: 1 +udf_processor: + count: 3 + uds_path: /dev/shm/udf_processor_worker + timeout_ms: 10000 + max_retry: 1 +restrict_tenanted_users_to_system_tables: false +restrict_tenanted_users_to_whitelist_settings: false +restrict_tenanted_users_to_privileged_operations: false +additional_services: + FullTextSearch: true + VectorSearch: true + GIS: true +sensitive_permission_tenants: 1234 diff --git 
a/src/CloudServices/CnchServerResource.cpp b/src/CloudServices/CnchServerResource.cpp
index 3f0b7527cef..f48689df9a2 100644
--- a/src/CloudServices/CnchServerResource.cpp
+++ b/src/CloudServices/CnchServerResource.cpp
@@ -420,11 +420,14 @@ void CnchServerResource::sendResources(const ContextPtr & context, std::optional
     max_threads = std::min(max_threads, all_resources.size());
     ExceptionHandler exception_handler;
     ThreadPool thread_pool(max_threads);
+    std::mutex call_ids_mutex;
     for (auto & all_resource : all_resources)
     {
         thread_pool.scheduleOrThrowOnError(createExceptionHandledJob(
             [&]() {
-                call_ids.emplace_back(doAsyncSend(context, all_resource.first, all_resource.second, handler));
+                auto call_id = doAsyncSend(context, all_resource.first, all_resource.second, handler);
+                std::unique_lock lock(call_ids_mutex);
+                call_ids.push_back(call_id);
             },
             exception_handler));
     }
diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp
index f8a6b3bb49f..b945334886a 100644
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@@ -116,9 +116,7 @@
     M(79, INCORRECT_FILE_NAME) \
     M(80, INCORRECT_QUERY) \
     M(81, UNKNOWN_DATABASE) \
-    M(81, UNKNOWN_CATALOG) \
     M(82, DATABASE_ALREADY_EXISTS) \
-    M(82, CATALOG_ALREADY_EXISTS) \
     M(83, DIRECTORY_DOESNT_EXIST) \
     M(84, DIRECTORY_ALREADY_EXISTS) \
     M(85, FORMAT_IS_NOT_SUITABLE_FOR_INPUT) \
@@ -703,6 +701,9 @@
     M(800, MYSQL_EXCEPTION) \
     M(801, UNSUPPORTED_MYSQL_TABLE) \
     M(802, UNEXPECTED_MATERIALIZED_MYSQL_STATE) \
+\
+    M(820, UNKNOWN_CATALOG) \
+    M(821, CATALOG_ALREADY_EXISTS) \
 \
     M(875, NO_SUCH_SERVICE) \
     M(876, NO_AVAILABLE_CONSUMER) \
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index 46937f71b91..82210de3376 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -1114,21 +1114,55 @@
     M(RegionManagerNumInMemBufCleanupRetries, "RegionManager number of in-memory buffer cleanup retries") \
     M(RegionManagerCleanRegionRetries, "RegionManager number of clean region retries") \
 \
-    M(NexusFSDiskCacheHit, "NexusFS disk cache hits") \
-    M(NexusFSDiskCacheHitInflightInsert, "NexusFS disk cache hits on in-flight inserts") \
-    M(NexusFSDiskCacheMiss, "NexusFS disk cache misses") \
+    M(NexusFSHit, "NexusFS hits") \
+    M(NexusFSHitInflightInsert, "NexusFS hits on in-flight inserts") \
+    M(NexusFSMiss, "NexusFS misses") \
+    M(NexusFSPreload, "NexusFS preloads") \
+    M(NexusFSDeepRetry, "NexusFS deep retries") \
     M(NexusFSDiskCacheEvict, "NexusFS disk cache evicts") \
-    M(NexusFSDiskCachePreload, "NexusFS disk cache preloads") \
-    M(NexusFSDiskCacheLookupRetries, "NexusFS disk cache retries in lookup") \
-    M(NexusFSDiskCacheInsertRetries, "NexusFS disk cache retries in insert") \
+    M(NexusFSDiskCacheInsertRetries, "NexusFS disk cache retries during insert") \
     M(NexusFSDiskCacheError, "NexusFS disk cache errors") \
-    M(NexusFSDiskCacheBytesRead, "NexusFS disk cache total bytes read") \
-    M(NexusFSDiskCacheBytesWrite, "NexusFS disk cache total bytes write") \
-    M(NexusFSMemoryBufferHit, "NexusFS memory buffer hits") \
-    M(NexusFSMemoryBufferMiss, "NexusFS memory buffer misses") \
-    M(NexusFSMemoryBufferEvict, "NexusFS memory buffer evicts") \
-    M(NexusFSMemoryBufferError, "NexusFS memory buffer errors") \
-    M(NexusFSMemoryBufferBytesRead, "NexusFS memory buffer total bytes read") \
+    M(NexusFSDiskCacheBytesRead, "NexusFS disk cache bytes read") \
+    M(NexusFSDiskCacheBytesWrite, "NexusFS disk cache bytes written") \
+    M(NexusFSReadFromInsertCxt, "NexusFS ReadFromInsertCxt successes") \
+    M(NexusFSReadFromInsertCxtRetry, 
"NexusFS ReadFromInsertCxt retries") \ + M(NexusFSReadFromInsertCxtDeepRetry, "NexusFS ReadFromInsertCxt deep retries") \ + M(NexusFSReadFromInsertCxtBytesRead, "NexusFS ReadFromInsertCxt bytes read") \ + M(NexusFSReadFromInsertCxtNonCopy, "NexusFS ReadFromInsertCxt by non-copying method successes") \ + M(NexusFSReadFromInsertCxtNonCopyBytesRead, "NexusFS ReadFromInsertCxt by non-copying method bytes read") \ + M(NexusFSReadFromDisk, "NexusFS ReadFromDisk successes") \ + M(NexusFSReadFromDiskRetry, "NexusFS ReadFromDisk retries") \ + M(NexusFSReadFromDiskDeepRetry, "NexusFS ReadFromDisk deep retries") \ + M(NexusFSReadFromDiskBytesRead, "NexusFS ReadFromDisk bytes read") \ + M(NexusFSReadFromBuffer, "NexusFS ReadFromBuffer successes") \ + M(NexusFSReadFromBufferRetry, "NexusFS ReadFromBuffer retries") \ + M(NexusFSReadFromBufferDeepRetry, "NexusFS ReadFromBuffer deep retries") \ + M(NexusFSReadFromBufferBytesRead, "NexusFS ReadFromBuffer bytes read") \ + M(NexusFSReadFromBufferNonCopy, "NexusFS ReadFromBuffer by non-copying method successes") \ + M(NexusFSReadFromBufferNonCopyBytesRead, "NexusFS ReadFromBuffer by non-copying method bytes read") \ + M(NexusFSReadFromSourceBytesRead, "NexusFS bytes read from source") \ + M(NexusFSReadFromSourceMicroseconds, "NexusFS read from source microseconds") \ + M(NexusFSTimeout, "NexusFS read timeouts") \ + M(NexusFSPrefetchToBuffer, "NexusFS PrefetchToBuffer successes") \ + M(NexusFSPrefetchToBufferBytesRead, "NexusFS PrefetchToBuffer bytes read") \ + M(NexusFSBufferHit, "NexusFS buffer hits") \ + M(NexusFSBufferMiss, "NexusFS buffer misses") \ + M(NexusFSBufferPreload, "NexusFS buffer preloads") \ + M(NexusFSBufferPreloadRetry, "NexusFS buffer retries in preload") \ + M(NexusFSBufferEmptyCoolingQueue, "NexusFS buffer cooling queue empty") \ + M(NexusFSInodeManagerLookupMicroseconds, "NexusFS InodeManager lookup microseconds") \ + M(NexusFSInodeManagerInsertMicroseconds, "NexusFS InodeManager insert microseconds") \ +\ + M(ReadFromNexusFSReadBytes, "Read bytes from nuxusfs.") \ + M(ReadFromNexusFSSeeks, "Total number of seeks for async buffer") \ + M(ReadFromNexusFSPrefetchRequests, "Number of prefetches made with asynchronous reading from nuxusfs") \ + M(ReadFromNexusFSUnusedPrefetches, "Number of prefetches pending at buffer destruction") \ + M(ReadFromNexusFSPrefetchedReads, "Number of reads from prefetched buffer") \ + M(ReadFromNexusFSPrefetchTaskWait, "Number of waiting when reading from prefetched buffer") \ + M(ReadFromNexusFSPrefetchTaskNotWait, "Number of not waiting when reading from prefetched buffer") \ + M(ReadFromNexusFSPrefetchedBytes, "Number of bytes from prefetched buffer") \ + M(ReadFromNexusFSAsynchronousWaitMicroseconds, "Time spent in waiting for asynchronous nuxusfs reads.") \ + M(ReadFromNexusFSSynchronousWaitMicroseconds, "Time spent in waiting for synchronous nuxusfs reads.") \ \ M(TSORequest, "Number requests sent to TSO") \ M(TSORequestMicroseconds, "Total time spent in get timestamp from TSO") \ @@ -1195,6 +1229,9 @@ M(GinIndexFilterResultCacheHit, "Number of posting list result cache hit") \ M(GinIndexFilterResultCacheMiss, "Number of posting list result cache miss") \ M(PrimaryAndSecondaryIndexFilterTime, "Time used in primary index and secondary indices filterr, in micro seconds") \ +\ + M(TableFinishStepPreClearHDFSTableMicroseconds, "") \ + M(TableFinishStepPreClearS3TableMicroseconds, "") \ namespace ProfileEvents { @@ -1243,13 +1280,20 @@ uint64_t Counters::getIOReadTime(bool use_async_read) const if 
     if (use_async_read)
     {
         return counters[ProfileEvents::RemoteFSAsynchronousReadWaitMicroseconds]
-            + counters[ProfileEvents::RemoteFSSynchronousReadWaitMicroseconds] + counters[ProfileEvents::DiskReadElapsedMicroseconds];
+            + counters[ProfileEvents::RemoteFSSynchronousReadWaitMicroseconds]
+            + counters[ProfileEvents::DiskReadElapsedMicroseconds]
+            + counters[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds]
+            + counters[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds];
     }
     // Else, we calculate the origin read IO time
     else
     {
-        return counters[ProfileEvents::HDFSReadElapsedMicroseconds] + counters[ProfileEvents::ReadBufferFromS3ReadMicroseconds]
-            + counters[ProfileEvents::DiskReadElapsedMicroseconds];
+        return counters[ProfileEvents::HDFSReadElapsedMicroseconds]
+            + counters[ProfileEvents::ReadBufferFromS3ReadMicroseconds]
+            + counters[ProfileEvents::DiskReadElapsedMicroseconds]
+            + counters[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds]
+            + counters[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds]
+            - counters[ProfileEvents::NexusFSReadFromSourceMicroseconds];
     }
 }
@@ -1269,14 +1313,19 @@ uint64_t Counters::Snapshot::getIOReadTime(bool use_async_read) const
     {
         return counters_holder[ProfileEvents::RemoteFSAsynchronousReadWaitMicroseconds]
             + counters_holder[ProfileEvents::RemoteFSSynchronousReadWaitMicroseconds]
-            + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds];
+            + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds]
+            + counters_holder[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds]
+            + counters_holder[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds];
     }
     // Else, we calculate the origin read IO time
     else
     {
         return counters_holder[ProfileEvents::HDFSReadElapsedMicroseconds]
             + counters_holder[ProfileEvents::ReadBufferFromS3ReadMicroseconds]
-            + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds];
+            + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds]
+            + counters_holder[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds]
+            + counters_holder[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds]
+            - counters_holder[ProfileEvents::NexusFSReadFromSourceMicroseconds];
     }
 }
diff --git a/src/Common/tests/gtest_global_context.h b/src/Common/tests/gtest_global_context.h
index 4302c047c46..166553b8744 100644
--- a/src/Common/tests/gtest_global_context.h
+++ b/src/Common/tests/gtest_global_context.h
@@ -121,14 +121,16 @@ inline const ContextHolder & getContext()
     return holder;
 }

-inline void setQueryDuration()
+inline void setQueryDuration(DB::ContextMutablePtr context = nullptr)
 {
-    auto & context = getContext().context;
+    if (!context)
+        context = getContext().context;
+
     auto & client_info = context->getClientInfo();
     const auto current_time = std::chrono::system_clock::now();
     client_info.initial_query_start_time = time_in_seconds(current_time);
     client_info.initial_query_start_time_microseconds = time_in_microseconds(current_time);
-    context->setQueryExpirationTimeStamp();
+    context->initQueryExpirationTimeStamp();
 }
diff --git a/src/Core/Defines.h b/src/Core/Defines.h
index 3dcd57198ca..65c1cb15724 100644
--- a/src/Core/Defines.h
+++ b/src/Core/Defines.h
@@ -127,7 +127,6 @@ constexpr auto TOS_PSM = "toutiao.tos.tosapi";
 #define DBMS_BRPC_PROTOCOL_MINOR_VERSION 4
 #define TEST_KNOB_FORCE_META_REBUILD 0x08ull
-#define ALL_TABLE_FALLBACK_CNCH_CATALOG "ALL_TABLE_FALLBACK_CNCH_CATALOG"

 /// SERVER VW
 constexpr auto DEFAULT_SERVER_VW_NAME = "server_vw_default";
diff
--git a/src/Core/Settings.h b/src/Core/Settings.h index b44f720563f..97c36c0d521 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -189,7 +189,7 @@ enum PreloadLevelSettings : UInt64 M(UInt64, s3_max_request_ms, 30000, "Request max timeout ms , now it is just for CnchS3", 0) \ M(Bool, s3_skip_empty_files, false, "Allow to skip empty files in s3 table engine", 0) \ M(Bool, overwrite_current_file, false, "Enable overwrite current file, now it is just for CnchS3/CnchHDFS", 0) \ - M(Bool, insert_new_file, true, "Create new file when write data into the file, now it is just for CnchS3/CnchHDFS", 0) \ + M(Bool, insert_new_file, false, "Create new file when write data into the file, now it is just for CnchS3/CnchHDFS", 0) \ M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \ M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \ M(Bool, replace_running_query, false, "Whether the running request should be canceled with the same id as the new one.", 0) \ @@ -1233,13 +1233,9 @@ enum PreloadLevelSettings : UInt64 "Number of thread performing background parts info collection in PartCacheManager.", \ 0) \ M(String, username_for_internal_communication, "server", "Username to be used by server for authentication on worker side.", 0) \ - M(UInt64, \ - cnch_part_allocation_algorithm, \ - 2, \ - "Part allocation algorithm, 0: jump consistent hashing, 1: bounded hash ring consistent hashing, 2: strict ring consistent " \ - "hashing.", \ - 0) \ M(UInt64, cnch_max_cached_storage, 2048, "Cnch storage cache size.", 0) \ + M(Bool, enable_internal_communication_user, true, "Enable specified user used by server for authentication on worker side.", 0) \ + M(UInt64, cnch_part_allocation_algorithm, 2, "Part allocation algorithm, 0: jump consistent hashing, 1: bounded hash ring consistent hashing, 2: strict ring consistent hashing.", 0) \ M(Bool, enable_multiple_tables_for_cnch_parts, 0, "Allow to query multiple tables for system.cnch_parts", 0) \ M(Bool, enable_skip_non_cnch_tables_for_cnch_parts, true, "Allow to skip non cnch tables for system.cnch_parts", 0) \ M(Bool, enable_skip_non_cnch_tables_for_cnch_trash_items, true, "Allow to skip non cnch tables for system.cnch_trash_items", 0) \ @@ -1663,7 +1659,8 @@ enum PreloadLevelSettings : UInt64 M(Bool, enable_materialized_view_rewrite_verbose_log, false, "Whether enable materialized view based rewriter for query", 0) \ M(Bool, enable_materialized_view_empty_grouping_rewriting, true, "Whether enable materialized view based rewriter for query", 0) \ M(Bool, enable_materialized_view_join_rewriting, true, "Whether enable materialized view based rewriter for query using join materialized views", 0) \ - M(Bool, enable_materialized_view_union_rewriting, true, "Whether enable materialized view based rewriter for query using union", 0) \ + M(Bool, enable_materialized_view_union_rewriting, false, "Whether enable materialized view based rewriter for query using union", 0) \ + M(Bool, enforce_materialized_view_union_rewriting, false, "Enforce enable materialized view based rewriter for query using union, used for testing", 0) \ M(MaterializedViewConsistencyCheckMethod, materialized_view_consistency_check_method, MaterializedViewConsistencyCheckMethod::PARTITION, "The method to check whether a materialized view is consistent with the base table for a query", 0) \ M(Bool, enable_execute_query, true, "Whether to execute this query", 0) \ M(UInt64, 
max_plan_segment_num, 500, "maximum plan segments allowed, 0 means no restriction", 0)\
@@ -1784,7 +1781,6 @@
     /* Transaction and catalog */ \
     M(Bool, ignore_duplicate_insertion_label, true, "Throw an exception if false", 0) \
     M(Bool, bypass_ddl_db_lock, true, "Bypass locking database while creating tables", 0) \
-    M(String, fallback_use_cnch_catalog, ALL_TABLE_FALLBACK_CNCH_CATALOG, "fallback using cnch catalog to get table first when resolving database and table failed", 0) \
     M(Bool, prefer_cnch_catalog, false, "Force using cnch catalog to get table first when resolving database and table", 0) \
     M(Bool, enable_interactive_transaction, true, "Enable interactive transaction", 0) \
     M(Bool, force_clean_transaction_by_dm, false, "Force clean transaction by dm, can be used for testing purpose", 0) \
@@ -1836,6 +1832,7 @@
     M(Bool, bsp_shuffle_reduce_locality_enabled, false, "Whether to compute locality preferences for reduce tasks", 0) \
     M(Float, bsp_shuffle_reduce_locality_fraction, 0.2, "Fraction of total map output that must be at a location for it to considered as a preferred location for a reduce task", 0) \
     M(UInt64, bsp_max_retry_num, 3, "max retry number for a task(plan segment instance) in bsp mode, does not include first execution(i.e. normal execution without retry)",0) \
+    M(Bool, enable_resource_aware_scheduler, false, "Whether to check resource before scheduling a segment instance", 0) \
     /*end of bulk synchronous parallel section*/ \
     M(Bool, enable_io_scheduler, false, "Enable io scheduler", 0) \
     M(Bool, enable_io_pfra, false, "Enable prefetch and read ahead for remote read", 0) \
diff --git a/src/DataStreams/PartitionedBlockOutputStream.cpp b/src/DataStreams/PartitionedBlockOutputStream.cpp
index 9b1f1d06b07..7d8870ac4d3 100644
--- a/src/DataStreams/PartitionedBlockOutputStream.cpp
+++ b/src/DataStreams/PartitionedBlockOutputStream.cpp
@@ -1,6 +1,7 @@
 #include "PartitionedBlockOutputStream.h"

 #include
+#include
 #include
 #include
@@ -14,6 +15,7 @@
 #include
 #include
+#include

 namespace DB
@@ -25,13 +27,13 @@ namespace ErrorCodes

 PartitionedBlockOutputStream::PartitionedBlockOutputStream(
     const ContextPtr & context_, const ASTPtr & partition_by, const Block & sample_block_)
-    : global_context(context_), sample_block(sample_block_)
+    : query_context(context_), sample_block(sample_block_)
 {
     std::vector<ASTPtr> arguments(1, partition_by);
     ASTPtr partition_by_string = makeASTFunction(FunctionToString::name, std::move(arguments));
-    auto syntax_result = TreeRewriter(global_context).analyze(partition_by_string, sample_block.getNamesAndTypesList());
-    partition_by_expr = ExpressionAnalyzer(partition_by_string, syntax_result, global_context).getActions(false);
+    auto syntax_result = TreeRewriter(query_context).analyze(partition_by_string, sample_block.getNamesAndTypesList());
+    partition_by_expr = ExpressionAnalyzer(partition_by_string, syntax_result, query_context).getActions(false);
     partition_by_column_name = partition_by_string->getColumnName();
 }
@@ -132,9 +134,10 @@ void PartitionedBlockOutputStream::validatePartitionKey(const String & str, bool
 }

-String PartitionedBlockOutputStream::replaceWildcards(const String & haystack, const String & partition_id)
+String PartitionedBlockOutputStream::replaceWildcards(const String & haystack, const String & partition_id, UInt32 parallel_index)
 {
-    return boost::replace_all_copy(haystack, PartitionedBlockOutputStream::PARTITION_ID_REPLACE, fmt::format("{}_{}", partition_id, getPodOrHostName()));
+    String replace_str
+        = partition_id.empty() ? fmt::format("{}_{}", parallel_index, getPodOrHostName()) : fmt::format("{}_{}_{}", partition_id, parallel_index, getPodOrHostName());
+    return boost::replace_all_copy(haystack, PartitionedBlockOutputStream::PARTITION_ID_REPLACE, replace_str);
 }
-
 }
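The point of the new `parallel_index` argument is that the pod name alone no longer disambiguates output files when several writers emit the same partition concurrently. A self-contained sketch of the expanded wildcard rule, using plain std::string stand-ins (the real code uses fmt::format, boost::replace_all_copy, and the PARTITION_ID_REPLACE constant; the marker string below is a placeholder):

#include <string>

// Mirrors the replacement rule above: "<parallel_index>_<pod>" when there is no
// partition id, "<partition_id>_<parallel_index>_<pod>" otherwise.
std::string replaceWildcards(std::string haystack, const std::string & partition_id,
                             unsigned parallel_index, const std::string & pod)
{
    const std::string marker = "{_partition_id}"; // stand-in for PARTITION_ID_REPLACE
    const std::string replace_str = partition_id.empty()
        ? std::to_string(parallel_index) + "_" + pod
        : partition_id + "_" + std::to_string(parallel_index) + "_" + pod;

    for (size_t pos = haystack.find(marker); pos != std::string::npos;
         pos = haystack.find(marker, pos + replace_str.size()))
        haystack.replace(pos, marker.size(), replace_str);
    return haystack;
}

For example, replaceWildcards("out_{_partition_id}.csv", "2024", 3, "pod0") would yield "out_2024_3_pod0.csv", so two writers on the same pod can no longer collide.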
diff --git a/src/DataStreams/PartitionedBlockOutputStream.h b/src/DataStreams/PartitionedBlockOutputStream.h
index f4e4897b4bd..35818b82d8b 100644
--- a/src/DataStreams/PartitionedBlockOutputStream.h
+++ b/src/DataStreams/PartitionedBlockOutputStream.h
@@ -36,9 +36,9 @@ class PartitionedBlockOutputStream : public IBlockOutputStream

     static void validatePartitionKey(const String & str, bool allow_slash);

-    static String replaceWildcards(const String & haystack, const String & partition_id);
+    static String replaceWildcards(const String & haystack, const String & partition_id, UInt32 parallel_index);

-    ContextPtr global_context;
+    ContextPtr query_context; // Note: make sure this is initialized from `query_context`, not `global_context`
     Block sample_block;

 private:
diff --git a/src/Disks/DiskByteS3.cpp b/src/Disks/DiskByteS3.cpp
index 902f63bada1..53341de64c0 100644
--- a/src/Disks/DiskByteS3.cpp
+++ b/src/Disks/DiskByteS3.cpp
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-#include <IO/ReadBufferFromFileWithNexusFS.h>
+#include <IO/ReadBufferFromNexusFS.h>
 #include
 #include
 #include
@@ -247,20 +247,21 @@ std::unique_ptr<ReadBufferFromFileBase> DiskByteS3::readFile(const String & path
 {
     ReadSettings modified_settings{settings};
     modified_settings.for_disk_s3 = true;
+    auto nexus_fs = settings.enable_nexus_fs ? Context::getGlobalContextInstance()->getNexusFS() : nullptr;
+    bool use_external_buffer = nexus_fs ? false : settings.remote_fs_prefetch;

     std::unique_ptr<ReadBufferFromFileBase> impl;
-    {
-        impl = std::make_unique<ReadBufferFromByteS3>(
-            s3_util.getClient(), s3_util.getBucket(), object_key, modified_settings, 3, false, settings.remote_fs_prefetch);
-    }
+    impl = std::make_unique<ReadBufferFromByteS3>(
+        s3_util.getClient(), s3_util.getBucket(), object_key, modified_settings, 3, false, use_external_buffer);

-    if (settings.enable_nexus_fs)
+    if (nexus_fs)
     {
-        auto nexus_fs = Context::getGlobalContextInstance()->getNexusFS();
-        if (nexus_fs)
-            impl = std::make_unique<ReadBufferFromFileWithNexusFS>(nexus_fs->getSegmentSize(), std::move(impl), *nexus_fs);
+        impl = std::make_unique<ReadBufferFromNexusFS>(
+            settings.local_fs_buffer_size,
+            settings.remote_fs_prefetch,
+            std::move(impl),
+            *nexus_fs);
     }
-
-    if (settings.remote_fs_prefetch)
+    else if (settings.remote_fs_prefetch)
     {
         auto impl = std::make_unique<ReadBufferFromByteS3>(s3_util.getClient(), s3_util.getBucket(), object_key, modified_settings, 3, false, /* use_external_buffer */true);
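Both disk implementations (S3 here, HDFS below) now follow the same selection rule, which inverts the old wrapping order: when NexusFS is enabled it owns buffering and prefetching itself, so the source buffer is created without an external buffer and the async-prefetch wrapper is skipped entirely. A compilable, self-contained analogue of that decision, with stand-in types rather than the real ByConity classes:

#include <memory>
#include <utility>

struct Reader { virtual ~Reader() = default; };
struct SourceReader : Reader { explicit SourceReader(bool /*use_external_buffer*/) {} };
struct CacheReader : Reader { CacheReader(std::unique_ptr<Reader>, bool /*prefetch*/) {} };
struct PrefetchReader : Reader { explicit PrefetchReader(std::unique_ptr<Reader>) {} };

std::unique_ptr<Reader> makeReader(bool cache_enabled, bool remote_fs_prefetch)
{
    // The cache layer, when present, takes over prefetching; only then may the
    // source buffer keep its own internal buffer.
    bool use_external_buffer = cache_enabled ? false : remote_fs_prefetch;
    std::unique_ptr<Reader> impl = std::make_unique<SourceReader>(use_external_buffer);
    if (cache_enabled)
        return std::make_unique<CacheReader>(std::move(impl), remote_fs_prefetch);
    if (remote_fs_prefetch)
        return std::make_unique<PrefetchReader>(std::move(impl));
    return impl;
}

The `else if` in the diff is the load-bearing change: previously a NexusFS-wrapped buffer could be wrapped a second time by the prefetch reader.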
diff --git a/src/Disks/HDFS/DiskByteHDFS.cpp b/src/Disks/HDFS/DiskByteHDFS.cpp
index 915d57deab8..7c60d065b1f 100644
--- a/src/Disks/HDFS/DiskByteHDFS.cpp
+++ b/src/Disks/HDFS/DiskByteHDFS.cpp
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include <IO/ReadBufferFromFileWithNexusFS.h>
+#include <IO/ReadBufferFromNexusFS.h>
 #include
 #include
 #include
@@ -237,18 +237,21 @@ std::unique_ptr<ReadBufferFromFileBase> DiskByteHDFS::readFile(const String & path
     }
     else
     {
+        auto nexus_fs = settings.enable_nexus_fs ? Context::getGlobalContextInstance()->getNexusFS() : nullptr;
+        bool use_external_buffer = nexus_fs ? false : settings.remote_fs_prefetch;
         std::unique_ptr<ReadBufferFromFileBase> impl;
+        impl = std::make_unique<ReadBufferFromByteHDFS>(
+            file_path, hdfs_params, settings, nullptr, 0, use_external_buffer);

-        impl = std::make_unique<ReadBufferFromByteHDFS>(file_path, hdfs_params, settings,
-            nullptr, 0, /* use_external_buffer */ settings.remote_fs_prefetch);

-        if (settings.enable_nexus_fs)
+        if (nexus_fs)
         {
-            auto nexus_fs = Context::getGlobalContextInstance()->getNexusFS();
-            if (nexus_fs)
-                impl = std::make_unique<ReadBufferFromFileWithNexusFS>(nexus_fs->getSegmentSize(), std::move(impl), *nexus_fs);
+            impl = std::make_unique<ReadBufferFromNexusFS>(
+                settings.local_fs_buffer_size,
+                settings.remote_fs_prefetch,
+                std::move(impl),
+                *nexus_fs);
         }
-
-        if (settings.remote_fs_prefetch)
+        else if (settings.remote_fs_prefetch)
         {
             auto global_context = Context::getGlobalContextInstance();
             auto reader = global_context->getThreadPoolReader();
diff --git a/src/Functions/LeastGreatestGeneric.h b/src/Functions/LeastGreatestGeneric.h
index a8bab0efd54..df44ff87762 100644
--- a/src/Functions/LeastGreatestGeneric.h
+++ b/src/Functions/LeastGreatestGeneric.h
@@ -107,6 +107,8 @@ class LeastGreatestOverloadResolver : public IFunctionOverloadResolver
     FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
     {
         DataTypes argument_types;
+        for (const auto & argument : arguments)
+            argument_types.push_back(argument.type);

         /// More efficient specialization for two numeric arguments.
         if (arguments.size() == 2 && isNumber(arguments[0].type) && isNumber(arguments[1].type))
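Without that loop, `argument_types` stayed empty on every path through `buildImpl`, so the `FunctionBase` built for `least`/`greatest` reported no argument types. A minimal standalone illustration of the pattern (simplified stand-in types, not the real IFunctionOverloadResolver interface):

#include <memory>
#include <string>
#include <vector>

struct Argument { std::string type; };
struct FunctionBase { std::vector<std::string> argument_types; };

// Collect every argument type up front, before any early-return specialization,
// so both the two-numeric fast path and the generic fallback see the full signature.
std::unique_ptr<FunctionBase> buildImpl(const std::vector<Argument> & arguments)
{
    std::vector<std::string> argument_types;
    for (const auto & argument : arguments)   // the added loop: runs unconditionally
        argument_types.push_back(argument.type);

    if (arguments.size() == 2 && arguments[0].type == "Number" && arguments[1].type == "Number")
        return std::make_unique<FunctionBase>(FunctionBase{argument_types}); // fast path

    return std::make_unique<FunctionBase>(FunctionBase{argument_types});     // generic path
}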
Offset: " + std::to_string(offset_), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); - - if (offset_ == getPosition()) - return offset_; - - if (!working_buffer.empty() - && static_cast(offset_) >= offset - working_buffer.size() - && offset_ < offset) - { - pos = working_buffer.end() - (offset - offset_); - assert(pos >= working_buffer.begin()); - assert(pos < working_buffer.end()); - - return getPosition(); - } - - resetWorkingBuffer(); - offset = offset_; - - return offset; -} - -IAsynchronousReader::Result ReadBufferFromFileWithNexusFS::readInto(char * data, size_t size, size_t read_offset, size_t ignore_bytes) -{ - bool result = false; - offset = read_offset; - set(data, size); - - if (ignore_bytes) - { - ignore(ignore_bytes); - result = hasPendingData(); - ignore_bytes = 0; - } - - if (!result) - result = next(); - - if (result) - { - assert(available()); - return { working_buffer.size(), BufferBase::offset(), nullptr }; - } - - return {0, 0, nullptr}; -} - -size_t ReadBufferFromFileWithNexusFS::readBigAt(char * to, size_t n, size_t range_begin, const std::function & progress_callback) -{ - if (n == 0) - return 0; - - size_t bytes_read = nexus_fs.read(file_name, range_begin, n, source_read_buffer, to); - - if (bytes_read && progress_callback) - progress_callback(bytes_read); - return bytes_read; -} - -void ReadBufferFromFileWithNexusFS::setReadUntilPosition(size_t position) -{ - if (position != static_cast(read_until_position)) - { - offset = getPosition(); - resetWorkingBuffer(); - read_until_position = position; - } -} - -void ReadBufferFromFileWithNexusFS::setReadUntilEnd() -{ - if (read_until_position) - { - offset = getPosition(); - resetWorkingBuffer(); - read_until_position = 0; - } -} - -} diff --git a/src/IO/ReadBufferFromNexusFS.cpp b/src/IO/ReadBufferFromNexusFS.cpp new file mode 100644 index 00000000000..5b9759a6726 --- /dev/null +++ b/src/IO/ReadBufferFromNexusFS.cpp @@ -0,0 +1,295 @@ +#include +#include "Common/Exception.h" +#include "common/logger_useful.h" +#include + + +namespace CurrentMetrics +{ + extern const Metric AsynchronousReadWait; +} + +namespace ProfileEvents +{ + extern const Event ReadFromNexusFSReadBytes; + extern const Event ReadFromNexusFSAsynchronousWaitMicroseconds; + extern const Event ReadFromNexusFSSynchronousWaitMicroseconds; + extern const Event ReadFromNexusFSSeeks; + extern const Event ReadFromNexusFSPrefetchRequests; + extern const Event ReadFromNexusFSUnusedPrefetches; + extern const Event ReadFromNexusFSPrefetchedReads; + extern const Event ReadFromNexusFSPrefetchedBytes; + extern const Event ReadFromNexusFSPrefetchTaskWait; + extern const Event ReadFromNexusFSPrefetchTaskNotWait; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int SEEK_POSITION_OUT_OF_BOUND; +} + +ReadBufferFromNexusFS::ReadBufferFromNexusFS( + size_t buf_size_, + bool actively_prefetch_, + std::unique_ptr source_read_buffer_, + NexusFS &nexus_fs_) + : ReadBufferFromFileBase(nexus_fs_.supportNonCopyingRead() ? 
0 : buf_size_, nullptr, 0)
+    , file_name(source_read_buffer_->getFileName())
+    , source_read_buffer(std::move(source_read_buffer_))
+    , nexus_fs(nexus_fs_)
+    , buf_size(buf_size_)
+    , read_to_internal_buffer(!nexus_fs_.supportNonCopyingRead())
+    , actively_prefetch(actively_prefetch_)
+{
+}
+
+ReadBufferFromNexusFS::~ReadBufferFromNexusFS()
+{
+    try
+    {
+        resetPrefetch();
+    }
+    catch (Exception & e)
+    {
+        LOG_WARNING(log, "resetPrefetch raises exception: {}", e.message());
+    }
+}
+
+bool ReadBufferFromNexusFS::nextImpl()
+{
+    if (!hasPendingDataToRead())
+        return false;
+
+    if (!read_to_internal_buffer)
+    {
+        // read from nexusfs by non-copying method
+        nexusfs_buffer.reset();
+
+        // first, check if there is prefetched data
+        if (prefetch_future.valid())
+        {
+            ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds);
+            CurrentMetrics::Increment metric_increment{CurrentMetrics::AsynchronousReadWait};
+
+            if (prefetch_future.wait_for(std::chrono::seconds(0)) == std::future_status::ready)
+                ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchTaskNotWait);
+            else
+                ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchTaskWait);
+
+            nexusfs_buffer = prefetch_future.get();
+            auto size = nexusfs_buffer.getSize();
+
+            prefetch_future = {};
+
+            ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchedReads);
+            ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchedBytes, size);
+        }
+        else
+        {
+            ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds);
+            size_t max_size_to_read = read_until_position ? read_until_position - offset : buf_size;
+            nexusfs_buffer = nexus_fs.read(
+                file_name,
+                offset,
+                max_size_to_read,
+                source_read_buffer);
+        }
+
+        size_t bytes_read = nexusfs_buffer.getSize();
+        if (bytes_read == 0)
+            return false;
+
+        ProfileEvents::increment(ProfileEvents::ReadFromNexusFSReadBytes, bytes_read);
+        BufferBase::set(nexusfs_buffer.getData(), bytes_read, 0);
+        offset += bytes_read;
+
+        if (actively_prefetch)
+            prefetch(Priority{0});
+
+        return true;
+    }
+
+    size_t max_size_to_read = internal_buffer.size();
+    if (read_until_position)
+    {
+        max_size_to_read = std::min(max_size_to_read, static_cast<size_t>(read_until_position - offset));
+    }
+
+    size_t total_bytes_read = 0;
+    {
+        ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds);
+        do
+        {
+            size_t bytes_read = nexus_fs.read(
+                file_name,
+                offset + total_bytes_read,
+                max_size_to_read - total_bytes_read,
+                source_read_buffer,
+                internal_buffer.begin() + total_bytes_read);
+
+            if (bytes_read == 0)
+                break;
+            total_bytes_read += bytes_read;
+        }
+        while (total_bytes_read < max_size_to_read);
+    }
+
+    if (total_bytes_read)
+    {
+        ProfileEvents::increment(ProfileEvents::ReadFromNexusFSReadBytes, total_bytes_read);
+        working_buffer = internal_buffer;
+        working_buffer.resize(total_bytes_read);
+        offset += total_bytes_read;
+        return true;
+    }
+
+    return false;
+}
+
+off_t ReadBufferFromNexusFS::seek(off_t offset_, int whence)
+{
+    ProfileEvents::increment(ProfileEvents::ReadFromNexusFSSeeks);
+    if (whence == SEEK_CUR)
+        offset_ = getPosition() + offset_;
+    else if (whence != SEEK_SET)
+        throw Exception("Seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::BAD_ARGUMENTS);
+
+    if (offset_ < 0)
+        throw Exception("Seek position is out of bounds. 
Offset: " + std::to_string(offset_), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); + + if (offset_ == getPosition()) + return offset_; + + if (!working_buffer.empty() + && static_cast(offset_) >= offset - working_buffer.size() + && offset_ < offset) + { + pos = working_buffer.end() - (offset - offset_); + assert(pos >= working_buffer.begin()); + assert(pos < working_buffer.end()); + + return getPosition(); + } + + resetWorkingBuffer(); + resetPrefetch(); + offset = offset_; + + return offset; +} + +IAsynchronousReader::Result ReadBufferFromNexusFS::readInto(char * data, size_t size, size_t read_offset, size_t ignore_bytes) +{ + bool result = false; + offset = read_offset; + set(data, size); + + auto original_status = read_to_internal_buffer; + read_to_internal_buffer = true; + + if (ignore_bytes) + { + ignore(ignore_bytes); + result = hasPendingData(); + ignore_bytes = 0; + } + if (!result) + result = next(); + + read_to_internal_buffer = original_status; + + if (result) + { + assert(available()); + return { working_buffer.size(), BufferBase::offset(), nullptr }; + } + + return {0, 0, nullptr}; +} + +bool ReadBufferFromNexusFS::hasPendingDataToRead() +{ + if (read_until_position) + { + if (read_until_position == offset) + return false; + + if (read_until_position < offset) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); + } + } + + return true; +} + +void ReadBufferFromNexusFS::prefetch(Priority) +{ + if (!nexus_fs.supportPrefetch()) + return; + + chassert(!read_to_internal_buffer); + + if (prefetch_future.valid()) + return; + + if (!hasPendingDataToRead()) + return; + + size_t max_size_to_read = read_until_position ? read_until_position - offset : buf_size; + + prefetch_future = nexus_fs.prefetchToBuffer(file_name, offset, max_size_to_read, source_read_buffer); + + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchRequests); +} + +size_t ReadBufferFromNexusFS::readBigAt(char * to, size_t n, size_t range_begin, const std::function & progress_callback) +{ + if (n == 0) + return 0; + + size_t bytes_read = nexus_fs.read(file_name, range_begin, n, source_read_buffer, to); + + if (bytes_read && progress_callback) + progress_callback(bytes_read); + return bytes_read; +} + +void ReadBufferFromNexusFS::setReadUntilPosition(size_t position) +{ + if (position != static_cast(read_until_position)) + { + offset = getPosition(); + resetWorkingBuffer(); + resetPrefetch(); + read_until_position = position; + } +} + +void ReadBufferFromNexusFS::setReadUntilEnd() +{ + if (read_until_position) + { + offset = getPosition(); + resetWorkingBuffer(); + read_until_position = 0; + } +} + +void ReadBufferFromNexusFS::resetPrefetch() +{ + if (!prefetch_future.valid()) + return; + + auto bwh = prefetch_future.get(); + prefetch_future = {}; + + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchedBytes, bwh.getSize()); + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSUnusedPrefetches); +} + +} diff --git a/src/IO/ReadBufferFromFileWithNexusFS.h b/src/IO/ReadBufferFromNexusFS.h similarity index 66% rename from src/IO/ReadBufferFromFileWithNexusFS.h rename to src/IO/ReadBufferFromNexusFS.h index 2c50b1c965e..18d3e5e20bb 100644 --- a/src/IO/ReadBufferFromFileWithNexusFS.h +++ b/src/IO/ReadBufferFromNexusFS.h @@ -4,25 +4,30 @@ #include "IO/SeekableReadBuffer.h" #include +#include +#include namespace DB { -class ReadBufferFromFileWithNexusFS : public ReadBufferFromFileBase +class ReadBufferFromNexusFS : public 
diff --git a/src/IO/ReadBufferFromFileWithNexusFS.h b/src/IO/ReadBufferFromNexusFS.h
similarity index 66%
rename from src/IO/ReadBufferFromFileWithNexusFS.h
rename to src/IO/ReadBufferFromNexusFS.h
index 2c50b1c965e..18d3e5e20bb 100644
--- a/src/IO/ReadBufferFromFileWithNexusFS.h
+++ b/src/IO/ReadBufferFromNexusFS.h
@@ -4,25 +4,30 @@
 #include "IO/SeekableReadBuffer.h"
 #include
+#include
+#include

 namespace DB
 {

-class ReadBufferFromFileWithNexusFS : public ReadBufferFromFileBase
+class ReadBufferFromNexusFS : public ReadBufferFromFileBase
 {
 public:
-    explicit ReadBufferFromFileWithNexusFS(
+    explicit ReadBufferFromNexusFS(
         size_t buf_size,
+        bool actively_prefetch,
         std::unique_ptr<ReadBufferFromFileBase> source_read_buffer,
         NexusFS &nexus_fs);

-    ~ReadBufferFromFileWithNexusFS() override = default;
+    ~ReadBufferFromNexusFS() override;

     bool nextImpl() override;

     off_t seek(off_t off, int whence) override;

+    void prefetch(Priority priority) override;
+
     IAsynchronousReader::Result readInto(char * data, size_t size, size_t offset, size_t ignore) override;

     size_t readBigAt(char * to, size_t n, size_t range_begin, const std::function<void(size_t)> & progress_callback) override;
@@ -38,14 +43,26 @@ class ReadBufferFromFileWithNexusFS : public ReadBufferFromFileBase
     bool isSeekCheap() override { return false; }

 private:
-    LoggerPtr log = getLogger("ReadBufferFromFileWithNexusFS");
+
+    bool hasPendingDataToRead();
+
+    void resetPrefetch();
+
+    LoggerPtr log = getLogger("ReadBufferFromNexusFS");

     const String file_name;
     std::unique_ptr<ReadBufferFromFileBase> source_read_buffer;
     NexusFS &nexus_fs;
+    const size_t buf_size = 0;

     off_t offset = 0;
     off_t read_until_position = 0;
+
+    bool read_to_internal_buffer = false;
+    NexusFSBufferWithHandle nexusfs_buffer;
+
+    const bool actively_prefetch = false;
+    std::future<NexusFSBufferWithHandle> prefetch_future;
 };

 }
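One design point the header makes visible: when the cache supports non-copying reads, the base buffer is constructed with size 0, and nextImpl() later points the working buffer directly at pinned cache memory via BufferBase::set instead of memcpy-ing into its own allocation. A standalone sketch of that dual delivery mode (stand-in types; the real pinned handle is NexusFSBufferWithHandle):

#include <algorithm>
#include <cstddef>
#include <cstring>
#include <utility>
#include <vector>

// Stand-in for a cache page pinned by a NexusFSBufferWithHandle-like handle.
struct PinnedBuffer { const char * data = nullptr; size_t size = 0; };

class DualModeReader
{
public:
    explicit DualModeReader(bool non_copying, size_t buf_size)
        : internal_buffer(non_copying ? 0 : buf_size) {} // size 0: no owned memory in non-copy mode

    // Returns a view of the next chunk, avoiding a copy when possible.
    std::pair<const char *, size_t> next(const PinnedBuffer & cache_page)
    {
        if (internal_buffer.empty()) // non-copying: point straight at the pinned cache memory
            return {cache_page.data, cache_page.size};

        size_t n = std::min(cache_page.size, internal_buffer.size()); // copying fallback
        std::memcpy(internal_buffer.data(), cache_page.data, n);
        return {internal_buffer.data(), n};
    }

private:
    std::vector<char> internal_buffer;
};

The copying path still exists because readInto() hands the buffer caller-owned memory, which is why the implementation above temporarily forces read_to_internal_buffer there.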
diff --git a/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp b/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp
index 085ffb866b9..29324ab2ddc 100644
--- a/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp
+++ b/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp
@@ -7,7 +7,7 @@
 #include
 #include
 #include
-#include <IO/ReadBufferFromFileWithNexusFS.h>
+#include <IO/ReadBufferFromNexusFS.h>
 #include
 #include
@@ -32,6 +32,8 @@ class ReadIndirectBuffer final : public ReadBufferFromFileBase

     off_t getPosition() override { return pos - working_buffer.begin(); }

+    size_t getFileSize() override { return working_buffer.size(); }
+
     off_t seek(off_t off, int whence) override
     {
         impl.swap(*this);
@@ -58,15 +60,24 @@ class ReadIndirectBuffer final : public ReadBufferFromFileBase
     const String path;
 };

-TEST(ReadBufferFromFileWithNexusFSTest, Read)
+TEST(ReadBufferFromNexusFSTest, Read)
 {
+    const UInt32 segment_size = 128;
     AutoPtr<MapConfiguration> conf(new MapConfiguration());
     conf->setBool("nexus_fs.use_memory_device", true);
-    conf->setUInt64("nexus_fs.cache_size", 64 * MiB);
+    conf->setUInt64("nexus_fs.cache_size", 512 * 10);
     conf->setUInt64("nexus_fs.region_size", 512);
-    conf->setUInt64("nexus_fs.segment_size", 128);
+    conf->setUInt64("nexus_fs.segment_size", segment_size);
     conf->setUInt("nexus_fs.alloc_align_size", 32);
     conf->setUInt("nexus_fs.io_align_size", 32);
+    conf->setUInt("nexus_fs.clean_regions_pool", 3);
+    conf->setUInt("nexus_fs.clean_region_threads", 2);
+    conf->setUInt("nexus_fs.num_in_mem_buffers", 6);
+    conf->setBool("nexus_fs.enable_memory_buffer", true);
+    conf->setUInt("nexus_fs.reader_threads", 8);
+    conf->setUInt64("nexus_fs.memory_buffer_size", 128 * 6);
+    conf->setDouble("nexus_fs.memory_buffer_cooling_percent", 0.4);
+    conf->setDouble("nexus_fs.memory_buffer_freed_percent", 0.2);

     NexusFSConfig nexusfs_conf;
     nexusfs_conf.loadFromConfig(*conf);
@@ -79,38 +90,47 @@
     String large_data;
     for (int i = 0; i < large_len; i++)
         large_data.push_back(i % 26 + 'a');
+    for (int i = 0; i < large_len; i += segment_size)
+    {
+        auto s = fmt::format("seg#{}", i / segment_size);
+        for (int j = 0; j < s.size(); j++)
+        {
+            if (i + j < large_len)
+                large_data[i + j] = s[j];
+        }
+    }

     // small read
     {
         auto source = std::make_unique<ReadIndirectBuffer>("file1", small_data);
-        ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs);
+        ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs);

         char buffer[small_len + 5];
         memset(buffer, 0, small_len + 5);
         auto bytes_read = read_buffer.readBig(buffer, small_len);

         ASSERT_EQ(bytes_read, small_len);
-        ASSERT_TRUE(strcmp(buffer, small_data.c_str()) == 0);
+        EXPECT_STREQ(buffer, small_data.c_str());
     }

     // large read
     {
         auto source = std::make_unique<ReadIndirectBuffer>("file2", large_data);
-        ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs);
+        ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(source), *nexus_fs);

         char buffer[large_len + 5];
         memset(buffer, 0, large_len + 5);
         auto bytes_read = read_buffer.readBig(buffer, large_len);

         ASSERT_EQ(bytes_read, large_len);
-        ASSERT_TRUE(strcmp(buffer, large_data.c_str()) == 0);
+        EXPECT_STREQ(buffer, large_data.c_str());
     }

     // with seek
     {
         constexpr int off = 200;
         auto source = std::make_unique<ReadIndirectBuffer>("file3", large_data);
-        ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs);
+        ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(source), *nexus_fs);

         char buffer[large_len - off + 5];
         memset(buffer, 0, large_len - off + 5);
@@ -118,7 +138,21 @@
         auto bytes_read = read_buffer.readBig(buffer, large_len - off);

         ASSERT_EQ(bytes_read, large_len - off);
-        ASSERT_TRUE(strcmp(buffer, large_data.substr(off).c_str()) == 0);
+        EXPECT_STREQ(buffer, large_data.substr(off).c_str());
+    }
+
+    // read nexus_fs disk cache
+    {
+        String data;
+        auto fake_source = std::make_unique<ReadIndirectBuffer>("file2", data);
+        ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(fake_source), *nexus_fs);
+
+        char buffer[large_len + 5];
+        memset(buffer, 0, large_len + 5);
+        auto bytes_read = read_buffer.readBig(buffer, large_len);
+
+        ASSERT_EQ(bytes_read, large_len);
+        EXPECT_STREQ(buffer, large_data.c_str());
     }

     // multi thread
@@ -128,14 +162,35 @@
         for (int i = 0; i < n; i++)
             threads[i] = std::thread([&](){
                 auto source = std::make_unique<ReadIndirectBuffer>("file4", large_data);
-                ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs);
+                ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs);
+
+                char buffer[large_len + 5];
+                memset(buffer, 0, large_len + 5);
+                auto bytes_read = read_buffer.readBig(buffer, large_len);
+
+                ASSERT_EQ(bytes_read, large_len);
+                EXPECT_STREQ(buffer, large_data.c_str());
+            });
+
+        for (int i = 0; i < n; i++)
+            threads[i].join();
+    }
+
+    // multi thread, non-aligned buffer size
+    {
+        constexpr int n = 20;
+        std::vector<std::thread> threads(n);
+        for (int i = 0; i < n; i++)
+            threads[i] = std::thread([&](){
+                auto source = std::make_unique<ReadIndirectBuffer>("file5", large_data);
+                ReadBufferFromNexusFS read_buffer(93, true, std::move(source), *nexus_fs);

                 char buffer[large_len + 5];
                 memset(buffer, 0, large_len + 5);
                 auto bytes_read = read_buffer.readBig(buffer, large_len);

                 ASSERT_EQ(bytes_read, large_len);
-                ASSERT_TRUE(strcmp(buffer, large_data.c_str()) == 0);
+                EXPECT_STREQ(buffer, large_data.c_str());
             });

         for (int i = 0; i < n; i++)
             threads[i].join();
     }
@@ -150,8 +205,8 @@
         std::vector<std::thread> threads(n);
         for (int i = 0; i < n; i++)
             threads[i] = std::thread([&](){
-                auto source = std::make_unique<ReadIndirectBuffer>("file5", large_data);
-                ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs);
+                auto source = std::make_unique<ReadIndirectBuffer>("file6", large_data);
+                ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(source), 
*nexus_fs); std::default_random_engine local_generator; local_generator.seed(i); @@ -165,7 +220,7 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) auto bytes_read = read_buffer.read(buffer, local_buffer_size); ASSERT_EQ(bytes_read, local_buffer_size); - ASSERT_TRUE(strcmp(buffer, large_data.substr(offset, local_buffer_size).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(offset, local_buffer_size).c_str()); } }); @@ -173,27 +228,12 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) threads[i].join(); } - // read nexus_fs disk cache - { - String data; - auto fake_source = std::make_unique("file2", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); - - char buffer[large_len + 5]; - memset(buffer, 0, large_len + 5); - auto bytes_read = read_buffer.readBig(buffer, large_len); - - ASSERT_EQ(bytes_read, large_len); - ASSERT_TRUE(strcmp(buffer, large_data.c_str()) == 0); - } - // read until pos { constexpr int until_pos = 678; constexpr int offset = 123; - String data; - auto fake_source = std::make_unique("file2", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); + auto source = std::make_unique("file2", large_data); + ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs); char buffer[until_pos - offset + 5]; memset(buffer, 0, until_pos - offset + 5); @@ -202,15 +242,14 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) auto bytes_read = read_buffer.read(buffer, large_len); ASSERT_EQ(bytes_read, until_pos - offset); - ASSERT_TRUE(strcmp(buffer, large_data.substr(offset, bytes_read).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(offset, bytes_read).c_str()); } // read until end { - constexpr int offset = 256; - String data; - auto fake_source = std::make_unique("file3", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); + constexpr int offset = 200; + auto source = std::make_unique("file5", large_data); + ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs); char buffer[large_len - offset + 5]; memset(buffer, 0, large_len - offset + 5); @@ -219,15 +258,15 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) auto bytes_read = read_buffer.read(buffer, large_len); ASSERT_EQ(bytes_read, large_len - offset); - ASSERT_TRUE(strcmp(buffer, large_data.substr(offset).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(offset).c_str()); } // readInto { - constexpr int off = 200; + constexpr int off = 256; String data; - auto fake_source = std::make_unique("file2", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); + auto fake_source = std::make_unique("file5", data); + ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(fake_source), *nexus_fs); char buffer[large_len - off + 5]; memset(buffer, 0, large_len - off + 5); @@ -243,7 +282,7 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) } ASSERT_EQ(bytes_read, large_len - off); - ASSERT_TRUE(strcmp(buffer, large_data.substr(off).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(off).c_str()); } conf.reset(); diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index c13cd2d503c..5c316e41d2b 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -419,7 +419,20 @@ SetPtr makeExplicitSet( const auto *const arg_type_ptr = typeid_cast(left_arg_type.get()); if (arg_type_ptr) + { left_arg_type = arg_type_ptr->getNestedType(); + if 
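// The branch below handles an empty array literal: its nested type is
// Nothing, so no Set can be built from actual elements. An empty Set is
// created instead (and cached under the usual PreparedSetKey), so calls
// like arraySetCheck([], (1, 2)) evaluate against an empty set rather than
// failing during set construction.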
(left_arg_type->getTypeId() == TypeIndex::Nothing) // Just make an empty set for empty array like arraySetCheck([], (1,2)) + { + DataTypes set_element_types = {left_arg_type}; + auto set_key = PreparedSetKey::forLiteral(*right_arg, set_element_types); + if (prepared_sets.count(set_key)) + return prepared_sets.at(set_key); /// Already prepared. + Block block; + SetPtr empty_set = std::make_shared(size_limits, create_ordered_set, context->getSettingsRef().transform_null_in); + prepared_sets[set_key] = empty_set; + return empty_set; + } + } else throw Exception("Invalid argument of function arraySet related functions", ErrorCodes::LOGICAL_ERROR); } diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 90963a6ffdb..00683ce470d 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -723,6 +724,15 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } + { + if (auto nexus_fs = getContext()->getNexusFS()) + { + new_values["NexusFSNumSegments"] = nexus_fs->getNumSegments(); + new_values["NexusFSNumFiles"] = nexus_fs->getNumFileMetas(); + new_values["NexusFSNumInodes"] = nexus_fs->getNumInodes(); + } + } + if (auto gin_store_reader_factory = getContext()->getGINStoreReaderFactory()) { new_values["GINReaderFactoryCacheBytes"] = gin_store_reader_factory->residentMemory(); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 252e0859600..330cfe71a2b 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -6176,7 +6176,14 @@ UInt32 Context::getQueryMaxExecutionTime() const return 100 * 60 * 1000; // default as 100min } -void Context::setQueryExpirationTimeStamp() +timespec Context::getQueryExpirationTimeStamp() const +{ + if (!query_expiration_timestamp) + throw Exception("query_expiration_timestamp has not set.", ErrorCodes::LOGICAL_ERROR); + return query_expiration_timestamp.value(); +} + +void Context::initQueryExpirationTimeStamp() { auto initial_query_start_time_ms = client_info.initial_query_start_time_microseconds / 1000; // Internal queries are those executed without an independent client context, diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 9fbb548f6eb..f889d329fc3 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -594,7 +594,7 @@ class ContextData QueueThrottlerDeleterPtr queue_throttler_ptr; bool enable_worker_fault_tolerance = false; - timespec query_expiration_timestamp{}; + std::optional query_expiration_timestamp; public: // Top-level OpenTelemetry trace context for the query. Makes sense only for a query context. 
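The expiration timestamp becomes a std::optional so that reading it before initialization is a hard error instead of a silent zero. A minimal sketch of the pattern, with a hypothetical QueryContextSketch standing in for Context (the real members are in the hunks around this one):

    #include <ctime>
    #include <optional>
    #include <stdexcept>

    struct QueryContextSketch
    {
        std::optional<timespec> query_expiration_timestamp;

        timespec getQueryExpirationTimeStamp() const
        {
            // Mirrors the new getter: throw if init was never called.
            if (!query_expiration_timestamp)
                throw std::logic_error("query_expiration_timestamp has not been set");
            return *query_expiration_timestamp;
        }

        void initQueryExpirationTimeStamp(time_t expire_at_sec)
        {
            query_expiration_timestamp = timespec{expire_at_sec, 0};
        }
    };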
@@ -1762,8 +1762,8 @@ class Context : public ContextData, public std::enable_shared_from_this void removeRunningBackupTask(const String & backup_id); UInt32 getQueryMaxExecutionTime() const; - timespec getQueryExpirationTimeStamp() const { return query_expiration_timestamp; } - void setQueryExpirationTimeStamp(); + timespec getQueryExpirationTimeStamp() const; + void initQueryExpirationTimeStamp(); AsynchronousReaderPtr getThreadPoolReader() const; #if USE_LIBURING diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index e4a3292def7..ddd87affb1a 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -360,32 +361,15 @@ DatabaseAndTable DatabaseCatalog::getTableImpl( auto it = databases.find(tenant_db); if (databases.end() == it) { - if (context_->getSettingsRef().fallback_use_cnch_catalog.value == String(ALL_TABLE_FALLBACK_CNCH_CATALOG) - || startsWith(table_id.table_name, context_->getSettingsRef().fallback_use_cnch_catalog.value)) - { - LOG_WARNING( - log, - "Database {} not found in local, fallback to request cnch catalog when fetching {}", + if (exception) + exception->emplace( + ErrorCodes::UNKNOWN_DATABASE, + "Database {} doesn't exist when fetching {}", backQuoteIfNeed(table_id.getDatabaseName()), table_id.getNameForLogs()); - database = tryGetDatabaseCnch(table_id.getDatabaseName(), context_); - } - - if (!database) - { - if (exception) - exception->emplace( - ErrorCodes::UNKNOWN_DATABASE, - "Database {} doesn't exist when fetching {}", - backQuoteIfNeed(table_id.getDatabaseName()), - table_id.getNameForLogs()); - return {}; - } - } - else - { - database = it->second; + return {}; } + database = it->second; } StoragePtr table = database->tryGetTable(table_id.table_name, context_, true); diff --git a/src/Interpreters/DistributedStages/BSPScheduler.cpp b/src/Interpreters/DistributedStages/BSPScheduler.cpp index d6e6e2f6f96..5db824535a2 100644 --- a/src/Interpreters/DistributedStages/BSPScheduler.cpp +++ b/src/Interpreters/DistributedStages/BSPScheduler.cpp @@ -1,6 +1,7 @@ #include "BSPScheduler.h" #include +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include #include @@ -109,6 +111,14 @@ bool BSPScheduler::processEvent(const ScheduleEvent & event) handleSegmentInstanceFinishedEvent(event); break; } + case ScheduleEventType::SendResourceRequest: { + handleSendResourceRequestEvent(event); + break; + } + case ScheduleEventType::ResourceRequestGranted: { + handleResourceRequestGrantedEvent(event); + break; + } default: throw Exception(fmt::format("Unexpected event type {}", event.getType()), ErrorCodes::LOGICAL_ERROR); } @@ -593,13 +603,14 @@ void BSPScheduler::triggerDispatch(const std::vector & available_wor if (worker_node.address.getHostName().empty()) worker_node.address = address; } + WorkerId worker_id; + // todo (wangtao.vip) move this vars into scheduler. + if (!cluster_nodes.vw_name.empty() && !cluster_nodes.worker_group_id.empty()) + worker_id = WorkerStatusManager::getWorkerId(cluster_nodes.vw_name, cluster_nodes.worker_group_id, worker.id); { // TODO(wangtao.vip): this should be handled with dispatch failure. 
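// worker_id is now computed before nodes_alloc_mutex is taken: the locked
// block only records the running instance, and sendResourceRequest() below
// needs the id after the lock has been released.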
std::unique_lock lk(nodes_alloc_mutex); running_segment_to_workers[task_instance.segment_id].insert(address); - WorkerId worker_id; - if (!cluster_nodes.vw_name.empty() && !cluster_nodes.worker_group_id.empty()) - worker_id = WorkerStatusManager::getWorkerId(cluster_nodes.vw_name, cluster_nodes.worker_group_id, worker.id); auto instance_id = PlanSegmentInstanceId{ static_cast(task_instance.segment_id), static_cast(task_instance.parallel_index)}; running_instances->insert(address, worker_id, instance_id); @@ -608,7 +619,66 @@ void BSPScheduler::triggerDispatch(const std::vector & available_wor local_address); /// init with server addr, as we wont schedule to server } - dispatchOrCollectTask(dag_graph_ptr->getPlanSegmentPtr(task_instance.segment_id), task_instance); + if (query_context->getSettingsRef().enable_resource_aware_scheduler) + // todo (wangtao.vip): to be event driven. combine with retrying. + sendResourceRequest(task_instance, worker_id); + else + dispatchOrCollectTask(dag_graph_ptr->getPlanSegmentPtr(task_instance.segment_id), task_instance); + } + } +} + +void BSPScheduler::sendResourceRequest(const SegmentTaskInstance & instance, const WorkerId & worker_id) +{ + // TODO(lianxuchao): predicate the resource + ResourceRequest req{ + .segment_id = static_cast(instance.segment_id), + .parallel_index = static_cast(instance.parallel_index), + .worker_id = worker_id.ToString(), + .v_cpu = 1, + .epoch = 0}; + postEvent(std::make_shared(std::list{req})); +} + +void BSPScheduler::handleSendResourceRequestEvent(const ScheduleEvent & event) +{ + const auto & request_event = dynamic_cast(event); + if (auto rm_client = query_context->getResourceManagerClient(); rm_client) + { + for (const auto & request : request_event.resource_request) + { + pending_resource_requests.insert(SegmentTaskInstance{request.segment_id, request.parallel_index}, request); + rm_client->sendResourceRequest(fillResourceRequestToProto(request)); + } + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Resource manager is needed in resource aware scheduler"); + } +} + +void BSPScheduler::handleResourceRequestGrantedEvent(const ScheduleEvent & event) +{ + const auto & grant = dynamic_cast(event); + const auto instance = SegmentTaskInstance{grant.segment_id, grant.parallel_index}; + if (pending_resource_requests.illegal(instance, grant.epoch)) + { + if (grant.ok) + { + if (pending_resource_requests.pending_requests.contains(instance)) + { + pending_resource_requests.erase(instance); + dispatchOrCollectTask(dag_graph_ptr->getPlanSegmentPtr(grant.segment_id), instance); + } + else + { + LOG_WARNING( + log, "Instance {}_{} is not in pending set, may be duplicated grant.", instance.segment_id, instance.parallel_index); + } + } + else + { + // todo (wangtao.vip) re-request after an interval, say 10 seconds. 
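// Not implemented in this PR; one hedged shape for that retry, reusing the
// pieces defined here (getOutdatedRequests() bumps each request's epoch, so
// a late grant carrying the old epoch is rejected by the epoch check):
//
//   if (auto outdated = pending_resource_requests.getOutdatedRequests(); !outdated.empty())
//       postEvent(std::make_shared<SendResourceRequestEvent>(std::move(outdated)));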
} } } @@ -772,4 +842,26 @@ bool BSPScheduler::isTaintNode(size_t task_id, const AddressInfo & worker, Taint throw Exception("Unexpected taint level", ErrorCodes::LOGICAL_ERROR); } } + +Protos::SendResourceRequestReq BSPScheduler::fillResourceRequestToProto(const ResourceRequest & req) +{ + Protos::SendResourceRequestReq pb; + local_address.toProto(*pb.mutable_server_addr()); + pb.set_req_type(::DB::Protos::ResourceRequestType::RESOURCE_REQUEST); + pb.set_query_id(query_id); + pb.set_query_start_ts(query_context->getClientInfo().initial_query_start_time_microseconds / 1000); + pb.set_segment_id(req.segment_id); + pb.set_parallel_index(req.parallel_index); + pb.set_worker_id(req.worker_id); + pb.set_request_vcpu(req.v_cpu); + pb.set_request_mem(req.mem); + pb.set_epoch(req.epoch); + + return pb; +} + +void BSPScheduler::resourceRequestGranted(const UInt32 segment_id, const UInt32 parallel_index, const UInt32 epoch, bool ok) +{ + postEvent(std::make_shared(segment_id, parallel_index, epoch, ok)); +} } diff --git a/src/Interpreters/DistributedStages/BSPScheduler.h b/src/Interpreters/DistributedStages/BSPScheduler.h index 9c6c941cf2a..4f9c5bdf4d5 100644 --- a/src/Interpreters/DistributedStages/BSPScheduler.h +++ b/src/Interpreters/DistributedStages/BSPScheduler.h @@ -1,20 +1,25 @@ +#pragma once + #include #include +#include #include #include #include #include +#include #include #include +#include #include #include #include #include +#include +#include +#include #include #include -#include "Catalog/Catalog.h" -#include "Interpreters/NodeSelector.h" -#include "Interpreters/WorkerStatusManager.h" namespace DB { @@ -58,6 +63,88 @@ class BSPScheduler : public Scheduler std::unordered_set no_prefs; }; + // ATTENTION: NOT thread safe. Make it used in main event processing. + struct PendingResourceRequests + { + std::unordered_map pending_requests; + std::unordered_map request_time; + + void insert(const SegmentTaskInstance & instance, const ResourceRequest & req) + { + pending_requests[instance] = req; + request_time[instance] = time(nullptr); + } + void erase(const SegmentTaskInstance & instance) + { + pending_requests.erase(instance); + request_time.erase(instance); + } + bool illegal(const SegmentTaskInstance & instance, UInt32 epoch) + { + return pending_requests.contains(instance) && pending_requests[instance].epoch == epoch; + } + std::list getOutdatedRequests() + { + std::list ret; + UInt32 now = time(nullptr); + for (auto iter = request_time.begin(); iter != request_time.end();) + { + if (now - iter->second > timeout_ms) + { + pending_requests[iter->first].epoch += 1; + ret.push_back(std::move(pending_requests[iter->first])); + iter = request_time.erase(iter); + } + else + iter++; + } + return ret; + } + + UInt32 timeout_ms; // todo (wangtao.vip) init this + }; + + struct RunningInstances + { + explicit RunningInstances(std::shared_ptr worker_status_manager_) + : worker_status_manager(worker_status_manager_) + { + } + void insert(const AddressInfo & address, const WorkerId & worker_id, const PlanSegmentInstanceId & instance_id) + { + worker_to_running_instances[address].insert(instance_id); + std::optional status = worker_status_manager->getWorkerStatus(worker_id); + running_instance_epoch[instance_id] = status.has_value() ? 
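// The epoch recorded here is the worker's register_time at dispatch time;
// comparing it against the register_time reported by a worker-restart event
// lets the scheduler tell which running instances predate the restart and
// therefore need to be rescheduled.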
status->worker_status->register_time : 0; + } + void erase(const AddressInfo & address, const PlanSegmentInstanceId & instance_id) + { + worker_to_running_instances[address].erase(instance_id); + running_instance_epoch.erase(instance_id); + } + std::unordered_set getInstances(const AddressInfo & address, UInt32 register_time) + { + std::unordered_set ret; + for (const auto & id : worker_to_running_instances[address]) + { + if (running_instance_epoch[id] == register_time) + ret.insert(id); + } + return ret; + } + UInt32 getEpoch(const PlanSegmentInstanceId & instance_id) + { + if (const auto & iter = running_instance_epoch.find(instance_id); iter != running_instance_epoch.end()) + { + return iter->second; + } + return static_cast(0); + } + // nodes -> instances running on it, used to retry them when worker restarted. + std::unordered_map, AddressInfo::Hash> worker_to_running_instances; + std::unordered_map running_instance_epoch; + std::shared_ptr worker_status_manager; + }; + public: BSPScheduler(const String & query_id_, ContextPtr query_context_, std::shared_ptr dag_graph_ptr_) : BSPScheduler( @@ -106,6 +193,7 @@ class BSPScheduler : public Scheduler SegmentInstanceStatusUpdateResult segmentInstanceFinished(size_t segment_id, UInt64 parallel_index, const RuntimeSegmentStatus & status); void workerRestarted(const WorkerId & id, const HostWithPorts & host_ports, UInt32 register_time); + void resourceRequestGranted(const UInt32 segment_id, const UInt32 parallel_index, const UInt32 epoch, bool ok); bool getEventToProcess(std::shared_ptr & event); bool processEvent(const ScheduleEvent & event); bool hasEvent() const; @@ -113,6 +201,7 @@ class BSPScheduler : public Scheduler { return cluster_nodes; } + size_t getAttemptId(const PlanSegmentInstanceId & instance_id) const { return segment_instance_attempts[instance_id]; @@ -126,47 +215,6 @@ class BSPScheduler : public Scheduler bool addBatchTask(BatchTaskPtr batch_task) override; private: - struct RunningInstances - { - explicit RunningInstances(std::shared_ptr worker_status_manager_) - : worker_status_manager(worker_status_manager_) - { - } - void insert(const AddressInfo & address, const WorkerId & worker_id, const PlanSegmentInstanceId & instance_id) - { - worker_to_running_instances[address].insert(instance_id); - std::optional status = worker_status_manager->getWorkerStatus(worker_id); - running_instance_epoch[instance_id] = status.has_value() ? status->worker_status->register_time : 0; - } - void erase(const AddressInfo & address, const PlanSegmentInstanceId & instance_id) - { - worker_to_running_instances[address].erase(instance_id); - running_instance_epoch.erase(instance_id); - } - std::unordered_set getInstances(const AddressInfo & address, UInt32 register_time) - { - std::unordered_set ret; - for (const auto & id : worker_to_running_instances[address]) - { - if (running_instance_epoch[id] == register_time) - ret.insert(id); - } - return ret; - } - UInt32 getEpoch(const PlanSegmentInstanceId & instance_id) - { - if (const auto & iter = running_instance_epoch.find(instance_id); iter != running_instance_epoch.end()) - { - return iter->second; - } - return static_cast(0); - } - // nodes -> instances running on it, used to retry them when worker restarted. 
- std::unordered_map, AddressInfo::Hash> worker_to_running_instances; - std::unordered_map running_instance_epoch; - std::shared_ptr worker_status_manager; - }; - bool postEvent(std::shared_ptr event); bool postHighPriorityEvent(std::shared_ptr event); std::pair getInstanceToSchedule(const AddressInfo & worker); @@ -177,6 +225,7 @@ class BSPScheduler : public Scheduler PlanSegmentExecutionInfo generateExecutionInfo(size_t task_id, size_t index) override; void submitTasks(PlanSegment * plan_segment_ptr, const SegmentTask & task) override; void prepareFinalTaskImpl(PlanSegment * final_plan_segment, const AddressInfo & addr) override; + void sendResourceRequest(const SegmentTaskInstance & instance, const WorkerId & worker_id); bool isUnrecoverableStatus(const RuntimeSegmentStatus & status); bool isOutdated(const RuntimeSegmentStatus & status); @@ -186,6 +235,8 @@ class BSPScheduler : public Scheduler void handleTriggerDispatchEvent(const ScheduleEvent& event); void handleWorkerRestartedEvent(const ScheduleEvent& event); void handleSegmentInstanceFinishedEvent(const ScheduleEvent& event); + void handleSendResourceRequestEvent(const ScheduleEvent & event); + void handleResourceRequestGrantedEvent(const ScheduleEvent & event); SegmentInstanceStatusUpdateResult onSegmentInstanceFinished(size_t segment_id, UInt64 parallel_index, const RuntimeSegmentStatus & status); @@ -193,6 +244,8 @@ class BSPScheduler : public Scheduler bool retryTaskIfPossible(size_t segment_id, UInt64 parallel_index, const RuntimeSegmentStatus & status); void resendResource(const HostWithPorts & host_ports); + Protos::SendResourceRequestReq fillResourceRequestToProto(const ResourceRequest & req); + // All batch task will be enqueue first. The schedule logic will pop queue and schedule the poped tasks. EventQueue queue{10000}; // Special queue for events of highest priority, like abort/resend resource. @@ -222,6 +275,9 @@ class BSPScheduler : public Scheduler std::unordered_map, SegmentTaskInstance::Hash> source_task_idx; std::unordered_map, SegmentTaskInstance::Hash> source_task_buckets; + // ATTENTION: NOT thread safe. Make it used in main event processing. + PendingResourceRequests pending_resource_requests; + /// Error reasons which can not be recovered by retry. We need quit right now. 
static std::unordered_set unrecoverable_reasons; size_t query_unique_id = {}; diff --git a/src/Interpreters/DistributedStages/MPPQueryCoordinator.cpp b/src/Interpreters/DistributedStages/MPPQueryCoordinator.cpp index 2808ebcfb44..1a0c8de907f 100644 --- a/src/Interpreters/DistributedStages/MPPQueryCoordinator.cpp +++ b/src/Interpreters/DistributedStages/MPPQueryCoordinator.cpp @@ -222,8 +222,13 @@ BlockIO MPPQueryCoordinator::execute() if (scheduler_status && !scheduler_status->exception.empty()) { - throw Exception( - "Query failed before final task execution, error message: " + scheduler_status->exception, scheduler_status->error_code); + const auto error_msg = "Query failed before final task execution, error message:" + std::move(scheduler_status->exception); + if (isAmbiguosError(scheduler_status->error_code)) + { + auto status = waitUntilFinish(scheduler_status->error_code, error_msg); + throw Exception(status.summarized_error_msg, status.error_code); + } + throw Exception(error_msg, scheduler_status->error_code); } if (!scheduler_status || !scheduler_status->is_final_stage_start) diff --git a/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.cpp b/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.cpp index f6a987f7fb7..1d551a12937 100644 --- a/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.cpp +++ b/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.cpp @@ -319,7 +319,7 @@ void PlanSegmentManagerRpcService::prepareCommonParams( UInt32 query_settings_buf_size, brpc::Controller * cntl, std::shared_ptr & query_common, - std::shared_ptr & settings_io_buf) + std::shared_ptr & settings_changes) { if (major_revision != DBMS_BRPC_PROTOCOL_MAJOR_VERSION) throw Exception( @@ -355,16 +355,27 @@ void PlanSegmentManagerRpcService::prepareCommonParams( ErrorCodes::LOGICAL_ERROR); } } - settings_io_buf = std::make_shared(settings_common_buf.movable()); + auto settings_io_buf = std::make_shared(settings_common_buf.movable()); + if (!settings_io_buf->empty()) + { + /// apply settings changed + ReadBufferFromBrpcBuf settings_read_buf(*settings_io_buf); + Settings settings; + const size_t MIN_MINOR_VERSION_ENABLE_STRINGS_WITH_FLAGS = 4; + if (query_common->brpc_protocol_minor_revision() >= MIN_MINOR_VERSION_ENABLE_STRINGS_WITH_FLAGS) + settings.read(settings_read_buf, SettingsWriteFormat::STRINGS_WITH_FLAGS); + else + settings.read(settings_read_buf, SettingsWriteFormat::BINARY); + auto changes = settings.changes(); + settings_changes = std::make_shared(changes); + } } ContextMutablePtr PlanSegmentManagerRpcService::createQueryContext( ContextMutablePtr global_context, std::shared_ptr & query_common, - std::shared_ptr & settings_io_buf, UInt16 remote_side_port, - PlanSegmentInstanceId instance_id, - const AddressInfo & execution_address) + PlanSegmentInstanceId instance_id) { /// Create context. 
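// With this change createQueryContext() only builds the bare context (the
// settings buffer is no longer passed in); authentication and the settings
// changes are applied later by initQueryContext(), which must run after
// setUser so the changes are checked against that user's constraints.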
ContextMutablePtr query_context; @@ -420,22 +431,28 @@ ContextMutablePtr PlanSegmentManagerRpcService::createQueryContext( client_info.current_address = std::move(current_socket_address); client_info.rpc_port = query_common->coordinator_address().exchange_port(); + if (query_common->has_parent_query_id()) + { + client_info.parent_initial_query_id = query_common->parent_query_id(); + query_context->setInternalQuery(query_common->is_internal_query()); + } + + return query_context; +} + +void PlanSegmentManagerRpcService::initQueryContext( + ContextMutablePtr query_context, + std::shared_ptr query_common, + std::shared_ptr settings_changes, + const AddressInfo & execution_address) +{ /// Authentication const auto & current_user = execution_address.getUser(); query_context->setUser(current_user, execution_address.getPassword(), query_context->getClientInfo().current_address); - if (!settings_io_buf->empty()) - { - ReadBufferFromBrpcBuf settings_read_buf(*settings_io_buf); - /// Sets an extra row policy based on `client_info.initial_user` - // query_context->setInitialRowPolicy(); - /// apply settings changed - const size_t MIN_MINOR_VERSION_ENABLE_STRINGS_WITH_FLAGS = 4; - if (query_common->brpc_protocol_minor_revision() >= MIN_MINOR_VERSION_ENABLE_STRINGS_WITH_FLAGS) - const_cast(query_context->getSettingsRef()).read(settings_read_buf, SettingsWriteFormat::STRINGS_WITH_FLAGS); - else - const_cast(query_context->getSettingsRef()).read(settings_read_buf, SettingsWriteFormat::BINARY); - } + /// apply settings changed, must after setUser + if (settings_changes) + query_context->applySettingsChanges(*settings_changes); /// Disable function name normalization when it's a secondary query, because queries are either /// already normalized on initiator node, or not normalized and should remain unnormalized for @@ -452,20 +469,12 @@ ContextMutablePtr PlanSegmentManagerRpcService::createQueryContext( if (!query_context->hasQueryContext()) query_context->makeQueryContext(); - query_context->setQueryExpirationTimeStamp(); - - if (query_common->has_parent_query_id()) - { - client_info.parent_initial_query_id = query_common->parent_query_id(); - query_context->setInternalQuery(query_common->is_internal_query()); - } - - return query_context; + query_context->initQueryExpirationTimeStamp(); } void PlanSegmentManagerRpcService::executePlanSegment( std::shared_ptr query_common, - std::shared_ptr settings_io_buf, + std::shared_ptr settings_changes, UInt16 remote_side_port, UInt32 segment_id, PlanSegmentExecutionInfo & execution_info, @@ -477,7 +486,7 @@ void PlanSegmentManagerRpcService::executePlanSegment( ThreadFromGlobalPool async_thread([global_context = context, query_common = std::move(query_common), - settings_io_buf = std::move(settings_io_buf), + settings_changes = std::move(settings_changes), remote_side_port = remote_side_port, segment_id = segment_id, execution_info = std::move(execution_info), @@ -488,13 +497,9 @@ void PlanSegmentManagerRpcService::executePlanSegment( try { if (!query_context) - query_context = createQueryContext( - global_context, - query_common, - settings_io_buf, - remote_side_port, - {segment_id, execution_info.parallel_id}, - execution_info.execution_address); + query_context = createQueryContext(global_context, query_common, remote_side_port, {segment_id, execution_info.parallel_id}); + + initQueryContext(query_context, query_common, settings_changes, execution_info.execution_address); if (!process_plan_segment_entry) process_plan_segment_entry = 
query_context->getPlanSegmentProcessList().insertGroup(query_context, segment_id); @@ -523,12 +528,12 @@ void PlanSegmentManagerRpcService::executePlanSegment( catch (...) { tryLogCurrentException(__PRETTY_FUNCTION__, query_context ? query_context->getCurrentQueryId() : ""); - if (before_execute) + if (before_execute && query_context) { int exception_code = getCurrentExceptionCode(); auto exception_message = getCurrentExceptionMessage(false); - auto result = convertFailurePlanSegmentStatusToResult(query_context, execution_info, exception_code, exception_message); + auto result = convertFailurePlanSegmentStatusToResult(std::move(query_context), execution_info, exception_code, exception_message); reportExecutionResult(result); } } @@ -554,14 +559,14 @@ void PlanSegmentManagerRpcService::submitPlanSegment( context->getEpoch()); auto query_common = std::make_shared(); - std::shared_ptr settings_io_buf; + std::shared_ptr settings_changes; prepareCommonParams( request->brpc_protocol_major_revision(), request->query_common_buf_size(), request->query_settings_buf_size(), cntl, query_common, - settings_io_buf); + settings_changes); PlanSegmentExecutionInfo execution_info; execution_info.parallel_id = request->parallel_id(); @@ -601,7 +606,7 @@ void PlanSegmentManagerRpcService::submitPlanSegment( executePlanSegment( std::move(query_common), - std::move(settings_io_buf), + std::move(settings_changes), cntl->remote_side().port, request->plan_segment_id(), execution_info, @@ -629,14 +634,14 @@ void PlanSegmentManagerRpcService::submitPlanSegments( try { auto query_common = std::make_shared(); - std::shared_ptr settings_io_buf; + std::shared_ptr settings_changes; prepareCommonParams( request->brpc_protocol_major_revision(), request->query_common_buf_size(), request->query_settings_buf_size(), cntl, query_common, - settings_io_buf); + settings_changes); // prepare segmentGroup std::vector segment_ids; @@ -650,7 +655,7 @@ void PlanSegmentManagerRpcService::submitPlanSegments( const auto execution_address = AddressInfo(request->execution_address()); auto first_query_context - = createQueryContext(context, query_common, settings_io_buf, cntl->remote_side().port, *first_instance_id, execution_address); + = createQueryContext(context, query_common, cntl->remote_side().port, *first_instance_id); auto process_plan_segment_entries = first_query_context->getPlanSegmentProcessList().insertGroup(first_query_context, segment_ids); const auto & headers = request->plan_segment_headers(); @@ -678,7 +683,7 @@ void PlanSegmentManagerRpcService::submitPlanSegments( executePlanSegment( query_common, - settings_io_buf, + settings_changes, cntl->remote_side().port, header.plan_segment_id(), execution_info, @@ -709,4 +714,40 @@ void PlanSegmentManagerRpcService::sendPlanSegmentProfile( const SegmentSchedulerPtr & scheduler = context->getSegmentScheduler(); scheduler->updateSegmentProfile(profile); } + +void PlanSegmentManagerRpcService::grantResourceRequest( + ::google::protobuf::RpcController * controller, + const ::DB::Protos::GrantResourceRequestReq * request, + ::DB::Protos::GrantResourceRequestResp * /*response*/, + ::google::protobuf::Closure * done) +{ + brpc::ClosureGuard done_guard(done); + brpc::Controller * cntl = static_cast(controller); + LOG_DEBUG( + log, + "Resource request({} {}_{}_{}) granted, result: {}", + request->req_type(), + request->query_id(), + request->segment_id(), + request->parallel_index(), + request->ok()); + + try + { + SegmentSchedulerPtr scheduler = context->getSegmentScheduler(); + auto 
bsp_scheduler = scheduler->getBSPScheduler(request->query_id()); + if (bsp_scheduler) + { + // todo (wangtao.vip): add not ok handling + // todo (wangtao.vip): use query start ms + bsp_scheduler->resourceRequestGranted(request->segment_id(), request->parallel_index(), request->epoch(), request->ok()); + } + } + catch (...) + { + auto error_msg = getCurrentExceptionMessage(false); + cntl->SetFailed(error_msg); + LOG_ERROR(log, "grantResourceRequest failed: {}", error_msg); + } +} } diff --git a/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.h b/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.h index 324fda38ff7..96776a996f1 100644 --- a/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.h +++ b/src/Interpreters/DistributedStages/PlanSegmentManagerRpcService.h @@ -139,6 +139,12 @@ class PlanSegmentManagerRpcService : public Protos::PlanSegmentManagerService ::DB::Protos::PlanSegmentProfileResponse * /*response*/, ::google::protobuf::Closure * done) override; + void grantResourceRequest( + ::google::protobuf::RpcController * controller, + const ::DB::Protos::GrantResourceRequestReq * request, + ::DB::Protos::GrantResourceRequestResp * response, + ::google::protobuf::Closure * done) override; + private: void prepareCommonParams( UInt32 major_revision, @@ -146,19 +152,25 @@ class PlanSegmentManagerRpcService : public Protos::PlanSegmentManagerService UInt32 query_settings_buf_size, brpc::Controller * cntl, std::shared_ptr & query_common, - std::shared_ptr & settings_io_buf); + std::shared_ptr & settings_changes); + // can be call both sync or async static ContextMutablePtr createQueryContext( ContextMutablePtr global_context, std::shared_ptr & query_common, - std::shared_ptr & settings_io_buf, UInt16 remote_side_port, - PlanSegmentInstanceId instance_id, + PlanSegmentInstanceId instance_id); + + // only can be call in async thread + static void initQueryContext( + ContextMutablePtr query_context, + std::shared_ptr query_common, + std::shared_ptr settings_changes, const AddressInfo & execution_address); void executePlanSegment( std::shared_ptr query_common, - std::shared_ptr settings_io_buf, + std::shared_ptr settings_changes, UInt16 remote_side_port, UInt32 segment_id, PlanSegmentExecutionInfo & execution_info, diff --git a/src/Interpreters/DistributedStages/PlanSegmentProcessList.h b/src/Interpreters/DistributedStages/PlanSegmentProcessList.h index 74f3a457d73..ad6ebfef404 100644 --- a/src/Interpreters/DistributedStages/PlanSegmentProcessList.h +++ b/src/Interpreters/DistributedStages/PlanSegmentProcessList.h @@ -87,9 +87,6 @@ class PlanSegmentGroup bool emplace_null(std::vector segment_ids) { std::unique_lock lock(mutex); - // for batch mode - if (segment_ids.size() != 1 && !segment_queries.empty()) - return false; for (const auto segment_id : segment_ids) { diff --git a/src/Interpreters/DistributedStages/ResourceRequest.h b/src/Interpreters/DistributedStages/ResourceRequest.h new file mode 100644 index 00000000000..4546c25f3dc --- /dev/null +++ b/src/Interpreters/DistributedStages/ResourceRequest.h @@ -0,0 +1,18 @@ +#pragma once + +#include + +namespace DB +{ + +struct ResourceRequest +{ + UInt32 segment_id; + UInt32 parallel_index; + String worker_id; + UInt32 v_cpu{1}; + UInt32 mem{0}; + UInt32 epoch{0}; +}; + +} // namespace DB diff --git a/src/Interpreters/DistributedStages/ScheduleEvent.h b/src/Interpreters/DistributedStages/ScheduleEvent.h index b0b61a31a5b..b9a6308a26c 100644 --- a/src/Interpreters/DistributedStages/ScheduleEvent.h +++ 
b/src/Interpreters/DistributedStages/ScheduleEvent.h @@ -1,12 +1,14 @@ #pragma once #include +#include #include #include #include namespace DB { + /// Indicates a plan segment. struct SegmentTask { @@ -39,7 +41,9 @@ enum class ScheduleEventType : uint8_t TriggerDispatch = 3, WorkerRestarted = 4, SegmentInstanceFinished = 5, - ResendResource = 6 + ResendResource = 6, + ResourceRequestGranted = 7, + SendResourceRequest = 8 }; struct ScheduleEvent @@ -149,4 +153,36 @@ struct ResendResourceEvent : ScheduleEvent const HostWithPorts host_ports; }; +struct ResourceRequestGrantedEvent : ScheduleEvent +{ + ResourceRequestGrantedEvent(const UInt32 segment_id_, const UInt32 parallel_index_, const UInt32 epoch_, const bool ok_) + : segment_id(segment_id_), parallel_index(parallel_index_), epoch(epoch_), ok(ok_) + { + } + + ScheduleEventType getType() const override + { + return ScheduleEventType::ResourceRequestGranted; + } + + const UInt32 segment_id; + const UInt32 parallel_index; + const UInt32 epoch; + const bool ok; +}; + +struct SendResourceRequestEvent : ScheduleEvent +{ + explicit SendResourceRequestEvent(std::list resource_request_) : resource_request(std::move(resource_request_)) + { + } + + ScheduleEventType getType() const override + { + return ScheduleEventType::SendResourceRequest; + } + + const std::list resource_request; +}; + } // namespace DB diff --git a/src/Interpreters/DistributedStages/executePlanSegment.cpp b/src/Interpreters/DistributedStages/executePlanSegment.cpp index bd3bf568bf4..f3462fe6e47 100644 --- a/src/Interpreters/DistributedStages/executePlanSegment.cpp +++ b/src/Interpreters/DistributedStages/executePlanSegment.cpp @@ -238,7 +238,7 @@ void executePlanSegmentRemotelyWithPreparedBuf( cntl->request_attachment().append(attachment.movable()); cntl->set_timeout_ms(context.getSettingsRef().send_plan_segment_timeout_ms.totalMilliseconds()); google::protobuf::Closure * done = brpc::NewCallback( - &OnSendPlanSegmentCallback, response, cntl, rpc_channel, context.getWorkerStatusManager(), async_context, worker_id); + &OnSendPlanSegmentCallback, response, cntl, std::move(rpc_channel), context.getWorkerStatusManager(), async_context, worker_id); async_context->addCallId(call_id); manager_stub.submitPlanSegment(cntl, &request, response, done); } @@ -283,7 +283,7 @@ void executePlanSegmentsRemotely( auto call_id = cntl->call_id(); cntl->request_attachment().append(attachment.movable()); google::protobuf::Closure * done = brpc::NewCallback( - &OnSendPlanSegmentCallback, response, cntl, rpc_channel, context.getWorkerStatusManager(), async_context, worker_id); + &OnSendPlanSegmentCallback, response, cntl, std::move(rpc_channel), context.getWorkerStatusManager(), async_context, worker_id); async_context->addCallId(call_id); manager_stub.submitPlanSegments(cntl, &request, response, done); } diff --git a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp index 00fa649294a..b4ef1e8afa7 100644 --- a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp +++ b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp @@ -97,6 +97,9 @@ struct NonGlobalTableData : public WithContext if (distributed_product_mode == DistributedProductMode::LOCAL) { /// Convert distributed table to corresponding remote table. 
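// Guard first: under DistributedProductMode::LOCAL the storage is not
// guaranteed to actually be a StorageDistributed (the dynamic_cast below
// can yield nullptr), so return early instead of dereferencing null.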
+ StorageDistributed * distributed = dynamic_cast(storage.get()); + if (!distributed) + return; std::string database; std::string table; diff --git a/src/Interpreters/ProcessorProfile.cpp b/src/Interpreters/ProcessorProfile.cpp index 813dd3da7c3..f3bf2879dfb 100644 --- a/src/Interpreters/ProcessorProfile.cpp +++ b/src/Interpreters/ProcessorProfile.cpp @@ -161,7 +161,9 @@ std::set GroupedProcessorProfile::fillChildren(Group auto roots = fillChildren(item.second, visited); for (const auto & root : roots) outputs.insert(root); + input_processor->parent_step_ids.emplace(item.second->step_id); } + input_processor->parents.clear(); return outputs; } @@ -374,7 +376,7 @@ GroupedProcessorProfile::getGroupedProfileFromMetrics(std::unordered_mapparents.emplace(node->processor_name, node); + // child->parents.emplace(node->processor_name, node); node->children.emplace_back(child); } } @@ -414,10 +416,10 @@ StepProfiles GroupedProcessorProfile::aggregateOperatorProfileToStepLevel(Groupe q.pop(); auto & current_step_id = processor_profile->step_id; auto & inputs = processor_profile->children; - auto & outputs = processor_profile->parents; + auto & outputs = processor_profile->parent_step_ids; if (current_step_id == -1 && !outputs.empty() && processor_profile->processor_name != "output_root") - current_step_id = outputs.begin()->second->step_id; + current_step_id = *outputs.begin(); step_processor_profiles_at_each_level[current_step_id].profiles_at_each_level[level].push_back(processor_profile); diff --git a/src/Interpreters/ProcessorProfile.h b/src/Interpreters/ProcessorProfile.h index 4133ef303f3..450882b5f5c 100644 --- a/src/Interpreters/ProcessorProfile.h +++ b/src/Interpreters/ProcessorProfile.h @@ -79,7 +79,8 @@ struct GroupedProcessorProfile std::unordered_set processor_ids; bool visited = false; - std::unordered_map parents; + std::unordered_map parents; // Be careful to avoid circular dependencies between parents and children + std::unordered_set parent_step_ids; std::vector children; UInt64 worker_cnt = 1; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 13db95306d9..92be5cebd6e 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1004,7 +1004,8 @@ static std::tuple executeQueryImpl( query_table = query_with_table_output->table; } - context->setQueryExpirationTimeStamp(); + context->initQueryExpirationTimeStamp(); + auto * insert_query = ast->as(); if (insert_query && insert_query->data) { diff --git a/src/Interpreters/sendPlanSegment.cpp b/src/Interpreters/sendPlanSegment.cpp index 925049f3a78..71698ab33ba 100644 --- a/src/Interpreters/sendPlanSegment.cpp +++ b/src/Interpreters/sendPlanSegment.cpp @@ -13,6 +13,18 @@ AddressInfo getLocalAddress(const Context & query_context) AddressInfo getRemoteAddress(HostWithPorts host_with_ports, ContextPtr & query_context) { + if(query_context->getSettingsRef().enable_internal_communication_user) + { + // Trick for avoiding RBAC performace loss + static auto [user, password] = query_context->getCnchInterserverCredentials(); + return AddressInfo( + host_with_ports.host, + host_with_ports.tcp_port, + user, + password, + host_with_ports.rpc_port); + } + const ClientInfo & info = query_context->getClientInfo(); return AddressInfo( host_with_ports.host, @@ -20,6 +32,7 @@ AddressInfo getRemoteAddress(HostWithPorts host_with_ports, ContextPtr & query_c info.current_user, info.current_password, host_with_ports.rpc_port); + } void sendPlanSegmentToAddress( diff --git 
a/src/Interpreters/tests/gtest_exchange_source_step.cpp b/src/Interpreters/tests/gtest_exchange_source_step.cpp index 82b34cf2f7e..7cdd700ce18 100644 --- a/src/Interpreters/tests/gtest_exchange_source_step.cpp +++ b/src/Interpreters/tests/gtest_exchange_source_step.cpp @@ -67,6 +67,8 @@ TEST(ExchangeSourceStep, InitializePipelineTest) auto coordinator_address_str = extractExchangeHostPort(coordinator_address); plan_segment.setCoordinatorAddress(coordinator_address); + context->setCoordinatorAddress(coordinator_address); + setQueryDuration(context); Block header = {ColumnWithTypeAndName(ColumnUInt8::create(), std::make_shared(), "local_exchange_test")}; PlanSegmentInputs inputs; diff --git a/src/Interpreters/tests/gtest_plan_segment_executor.cpp b/src/Interpreters/tests/gtest_plan_segment_executor.cpp index a840aa9b57f..c2d4d9c062a 100644 --- a/src/Interpreters/tests/gtest_plan_segment_executor.cpp +++ b/src/Interpreters/tests/gtest_plan_segment_executor.cpp @@ -135,6 +135,7 @@ TEST_F(PlanSegmentExecutorTest, ExecuteTest) context->getClientInfo().initial_query_id = plan_segment.getQueryId(); context->getClientInfo().current_query_id = plan_segment.getQueryId() + std::to_string(plan_segment.getPlanSegmentId()); context->setCoordinatorAddress(coordinator_address); + setQueryDuration(context); DataStream datastream{.header = header}; auto exchange_source_step = std::make_unique(inputs, datastream, false, false); @@ -245,6 +246,7 @@ TEST_F(PlanSegmentExecutorTest, ExecuteAsyncTest) context->getClientInfo().initial_query_id = plan_segment.getQueryId(); context->getClientInfo().current_query_id = plan_segment.getQueryId() + std::to_string(plan_segment.getPlanSegmentId()); context->setCoordinatorAddress(coordinator_address); + setQueryDuration(context); DataStream datastream{.header = header}; auto exchange_source_step = std::make_unique(inputs, datastream, false, false); @@ -358,6 +360,7 @@ TEST_F(PlanSegmentExecutorTest, ExecuteCancelTest) context->getClientInfo().initial_query_id = plan_segment.getQueryId(); context->getClientInfo().current_query_id = plan_segment.getQueryId() + std::to_string(plan_segment.getPlanSegmentId()); context->setCoordinatorAddress(coordinator_address); + setQueryDuration(context); DataStream datastream{.header = header}; auto exchange_source_step = std::make_unique(inputs, datastream, false, false); @@ -489,6 +492,7 @@ void planExecutor(String query_id, AddressInfo coordinator_address) context->getClientInfo().initial_query_id = plan_segment.getQueryId(); context->getClientInfo().current_query_id = plan_segment.getQueryId() + std::to_string(plan_segment.getPlanSegmentId()); context->setCoordinatorAddress(coordinator_address); + setQueryDuration(context); DataStream datastream{.header = header}; auto exchange_source_step = std::make_unique(inputs, datastream, false, false); diff --git a/src/Interpreters/tests/gtest_scheduler.cpp b/src/Interpreters/tests/gtest_scheduler.cpp index 47f6252dd41..9de22c4d245 100644 --- a/src/Interpreters/tests/gtest_scheduler.cpp +++ b/src/Interpreters/tests/gtest_scheduler.cpp @@ -77,7 +77,7 @@ SchedulerTestContext createSchedulerTestContext(size_t parallel_size, const std: result.query_context = getContext().createQueryContext("q1", settings); DB::TxnTimestamp t1 = 1; result.query_context->initCnchServerResource(t1); - result.query_context->setQueryExpirationTimeStamp(); + result.query_context->initQueryExpirationTimeStamp(); /// prepare plan segment result.segments = {std::make_shared(2, "q1", "c1"), std::make_shared(1, "q1", "c2")}; 
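These executor and scheduler tests now have to prepare the query context before running a segment, in line with the stricter expiration-timestamp getter. A condensed sketch of the shared setup order, assuming the helpers used in these tests (createQueryContext, setQueryDuration) behave as shown above:

    auto context = getContext().createQueryContext("q1", settings);
    context->setCoordinatorAddress(coordinator_address);  // exchange tests need it
    setQueryDuration(context);                            // executor tests call this
    context->initQueryExpirationTimeStamp();              // renamed from setQueryExpirationTimeStamp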
diff --git a/src/Optimizer/CardinalityEstimate/CardinalityEstimator.cpp b/src/Optimizer/CardinalityEstimate/CardinalityEstimator.cpp index 361dc5ba603..1587568cd72 100644 --- a/src/Optimizer/CardinalityEstimate/CardinalityEstimator.cpp +++ b/src/Optimizer/CardinalityEstimate/CardinalityEstimator.cpp @@ -258,7 +258,7 @@ PlanNodeStatisticsPtr CardinalityVisitor::visitReadNothingStep(const ReadNothing PlanNodeStatisticsPtr CardinalityVisitor::visitReadStorageRowCountStep(const ReadStorageRowCountStep &, CardinalityContext &) { - return std::make_shared(); + return std::make_shared(1); } PlanNodeStatisticsPtr CardinalityVisitor::visitValuesStep(const ValuesStep & step, CardinalityContext &) diff --git a/src/Optimizer/PlanOptimizer.cpp b/src/Optimizer/PlanOptimizer.cpp index ab798df2146..87af43b8b6e 100644 --- a/src/Optimizer/PlanOptimizer.cpp +++ b/src/Optimizer/PlanOptimizer.cpp @@ -482,7 +482,6 @@ const Rewriters & PlanOptimizer::getFullRewriters() std::make_shared(Rules::pushPartialStepRules(), "PushPartialStep"), std::make_shared(Rules::optimizeAggregateRules(), "OptimizeAggregate"), - std::make_shared(), // use property std::make_shared(), diff --git a/src/Optimizer/Property/PropertyDeriver.cpp b/src/Optimizer/Property/PropertyDeriver.cpp index c3f6397f7c5..eec948a741d 100644 --- a/src/Optimizer/Property/PropertyDeriver.cpp +++ b/src/Optimizer/Property/PropertyDeriver.cpp @@ -602,7 +602,9 @@ Property DeriverVisitor::visitReadNothingStep(const ReadNothingStep &, DeriverCo Property DeriverVisitor::visitReadStorageRowCountStep(const ReadStorageRowCountStep &, DeriverContext &) { - return Property{Partitioning(Partitioning::Handle::SINGLE), Partitioning(Partitioning::Handle::ARBITRARY)}; + auto prop = Partitioning(Partitioning::Handle::SINGLE); + prop.setComponent(Partitioning::Component::COORDINATOR); + return Property{prop, Partitioning(Partitioning::Handle::ARBITRARY)}; } Property DeriverVisitor::visitValuesStep(const ValuesStep &, DeriverContext &) diff --git a/src/Optimizer/QueryUseOptimizerChecker.cpp b/src/Optimizer/QueryUseOptimizerChecker.cpp index dbb286520da..38039ffd089 100644 --- a/src/Optimizer/QueryUseOptimizerChecker.cpp +++ b/src/Optimizer/QueryUseOptimizerChecker.cpp @@ -71,7 +71,7 @@ void turnOffOptimizer(ContextMutablePtr context, ASTPtr & node) changeASTSettings(node); } -static bool checkDatabaseAndTable(String database_name, String table_name, ContextMutablePtr context, const NameSet & ctes) +static bool checkDatabaseAndTable(String database_name, String table_name, ContextMutablePtr context, const NameSet & ctes, String & reason) { /// not with table if (database_name.empty() && ctes.contains(table_name)) @@ -84,7 +84,10 @@ static bool checkDatabaseAndTable(String database_name, String table_name, Conte database_name = context->getCurrentDatabase(); if (!storage_table) + { + reason = fmt::format("table not found: {}.{}", database_name, table_name); return false; + } if (database_name == "system") return true; @@ -95,11 +98,21 @@ static bool checkDatabaseAndTable(String database_name, String table_name, Conte auto subquery = table_metadata_snapshot->getSelectQuery().inner_query; QueryUseOptimizerVisitor checker; - QueryUseOptimizerContext check_context{.context = context}; - return ASTVisitorUtil::accept(subquery, checker, check_context); + QueryUseOptimizerContext check_context {.context = context}; + if (!ASTVisitorUtil::accept(subquery, checker, check_context)) + { + reason = checker.getReason(); + return false; + } + return true; } - return 
storage_table->supportsOptimizer(); + if (!storage_table->supportsOptimizer()) + { + reason = fmt::format("unsupport storage {}: {}.{}", storage_table->getName(), database_name, table_name); + return false; + } + return true; } bool QueryUseOptimizerChecker::check(ASTPtr node, ContextMutablePtr context, bool throw_exception) @@ -191,11 +204,8 @@ bool QueryUseOptimizerChecker::check(ASTPtr node, ContextMutablePtr context, boo if (database.empty()) database = context->getCurrentDatabase(); - if (!checkDatabaseAndTable(database, insert_query->table_id.getTableName(), context, {})) - { - reason = "unsupported storage, database: " + database + ", table: " + insert_query->table_id.getTableName(); + if (!checkDatabaseAndTable(database, insert_query->table_id.getTableName(), context, {}, reason)) support = false; - } } LOG_DEBUG( @@ -228,12 +238,13 @@ bool QueryUseOptimizerVisitor::visitNode(ASTPtr & node, QueryUseOptimizerContext return true; } -static bool checkDatabaseAndTable(const ASTTableExpression & table_expression, const ContextMutablePtr & context, const NameSet & ctes) +static bool +checkDatabaseAndTable(const ASTTableExpression & table_expression, const ContextMutablePtr & context, const NameSet & ctes, String & reason) { if (table_expression.database_and_table_name) { auto db_and_table = DatabaseAndTableWithAlias(table_expression.database_and_table_name); - return checkDatabaseAndTable(db_and_table.database, db_and_table.table, context, ctes); + return checkDatabaseAndTable(db_and_table.database, db_and_table.table, context, ctes, reason); } return true; } @@ -266,11 +277,9 @@ bool QueryUseOptimizerVisitor::visitASTSelectQuery(ASTPtr & node, QueryUseOptimi for (const auto * table_expression : getTableExpressions(*select)) { - if (!checkDatabaseAndTable(*table_expression, child_context.context, child_context.ctes)) - { - reason = "unsupported storage: " + table_expression->formatForErrorMessage(); + if (!checkDatabaseAndTable(*table_expression, child_context.context, child_context.ctes, reason)) return false; - } + if (table_expression->table_function) { const auto & function = table_expression->table_function->as(); @@ -316,11 +325,8 @@ bool QueryUseOptimizerVisitor::visitASTFunction(ASTPtr & node, QueryUseOptimizer { ASTTableExpression table_expression; table_expression.database_and_table_name = table; - if (!checkDatabaseAndTable(table_expression, context.context, context.ctes)) - { - reason = "unsupported storage: " + table_expression.formatForErrorMessage(); + if (!checkDatabaseAndTable(table_expression, context.context, context.ctes, reason)) return false; - } } } } diff --git a/src/Optimizer/Rewriter/AddBufferForDeadlockCTE.cpp b/src/Optimizer/Rewriter/AddBufferForDeadlockCTE.cpp index dae3f0324a3..8e64d75debe 100644 --- a/src/Optimizer/Rewriter/AddBufferForDeadlockCTE.cpp +++ b/src/Optimizer/Rewriter/AddBufferForDeadlockCTE.cpp @@ -334,7 +334,7 @@ namespace deadlock_ctes.emplace(node_id); } - left_ctes.insert(right_ctes.begin(), left_ctes.end()); + left_ctes.insert(right_ctes.begin(), right_ctes.end()); return left_ctes; } diff --git a/src/Optimizer/Rewriter/MaterializedViewRewriter.cpp b/src/Optimizer/Rewriter/MaterializedViewRewriter.cpp index ca450d98756..1d07f205782 100644 --- a/src/Optimizer/Rewriter/MaterializedViewRewriter.cpp +++ b/src/Optimizer/Rewriter/MaterializedViewRewriter.cpp @@ -22,6 +22,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -51,6 +55,7 @@ #include #include #include +#include #include #include 
#include @@ -59,16 +64,16 @@ #include #include #include +#include #include #include #include #include #include #include -#include "Common/LinkedHashMap.h" #include +#include #include -#include "QueryPlan/CTEVisitHelper.h" #include #include @@ -312,6 +317,9 @@ class CandidatesExplorer : public PlanNodeVisitor JoinHyperGraph::MAX_NODE) return; + if (tables.size() > 1 && !context->getSettingsRef().enable_materialized_view_join_rewriting) + return; + std::vector related_materialized_views; std::shared_ptr query_aggregate = extractTopAggregate(query); @@ -532,6 +540,8 @@ class CandidatesExplorer : public PlanNodeVisitorgetKeys().size() < view_aggregate->getKeys().size() + && (!async_materialized_view || context->getSettingsRef().enforce_materialized_view_union_rewriting || !view_aggregate + || query_aggregate->getKeys().size() < view_aggregate->getKeys().size() || !PredicateUtils::isFalsePredicate(union_predicate)); // 3-1. query aggregate has default result if group by has empty set. not supported yet. @@ -773,13 +784,18 @@ class CandidatesExplorer : public PlanNodeVisitorgetSettingsRef().enforce_materialized_view_union_rewriting) { if (query_aggregate && query->getType() != IQueryPlanStep::Type::Aggregating) { add_failure_message("union rewrite for aggregating only supports query with aggregates on the top"); continue; // bail out } + if (query_aggregate && !view_aggregate) + { + add_failure_message("union rewrite for aggregating only supports view with aggregate"); + continue; // bail out + } EqualityASTMap query_output_columns_map; NameSet outputs; @@ -918,7 +934,7 @@ class CandidatesExplorer : public PlanNodeVisitor { auto & candidates = it->second; auto candidate_it = std::min_element(candidates.begin(), candidates.end(), RewriterCandidateSort()); - context->getOptimizerMetrics()->addMaterializedView(candidate_it->view_database_and_table_name); - return constructEquivalentPlan(node.shared_from_this(), *candidate_it); + LOG_INFO(log, "use materialized view {} for query rewriting", candidate_it->view_database_and_table_name.getFullTableName()); + try + { + auto plan = constructEquivalentPlan(node.shared_from_this(), *candidate_it); + context->getOptimizerMetrics()->addMaterializedView(candidate_it->view_database_and_table_name); + return plan; + } + catch (...) 
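// Fall back instead of failing: if constructEquivalentPlan throws, the
// exception is logged and the original (non-rewritten) plan is returned.
// addMaterializedView is also recorded only after the plan was built, so
// optimizer metrics never reference a view whose rewrite was rolled back.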
+ { + tryLogCurrentException( + log, + "construct equivalent plan use materialized view failed: " + + candidate_it->view_database_and_table_name.getFullTableName()); + return SimplePlanRewriter::visitPlanNode(node, c); + } } return SimplePlanRewriter::visitPlanNode(node, c); } @@ -1285,7 +1314,8 @@ class CostBasedMaterializedViewRewriter : public SimplePlanRewriter assignments.emplace(assignemnt.first, symbol_mapper.map(assignemnt.second)); // plan aggregate + union rewrite before - if (!PredicateUtils::isFalsePredicate(candidate.union_predicate)) + if (!PredicateUtils::isFalsePredicate(candidate.union_predicate) + || context->getSettingsRef().enforce_materialized_view_union_rewriting) plan = planUnionBeforeAggragte(plan, query, candidate.union_predicate, assignments); plan = PlanNodeBase::createPlanNode( @@ -1303,7 +1333,8 @@ class CostBasedMaterializedViewRewriter : public SimplePlanRewriter {plan}); // simple union rewrite - if (!PredicateUtils::isFalsePredicate(candidate.union_predicate)) + if (!PredicateUtils::isFalsePredicate(candidate.union_predicate) + || context->getSettingsRef().enforce_materialized_view_union_rewriting) plan = planUnion(plan, query, candidate.union_predicate); // reallocate symbols @@ -1444,6 +1475,7 @@ class CostBasedMaterializedViewRewriter : public SimplePlanRewriter view_name_to_aggreagte.emplace(aggregate.column_name, &aggregate); AggregateDescriptions rewrite_aggregates; + Names group_keys{query_step.getKeys().begin(), query_step.getKeys().end()}; for (const auto & aggregate : query_step.getAggregates()) { const auto & view_aggregate_output = assignments.at(aggregate.column_name); @@ -1451,28 +1483,43 @@ class CostBasedMaterializedViewRewriter : public SimplePlanRewriter throw Exception("union rewrite failed: view_aggregate_output expected as identifier", ErrorCodes::LOGICAL_ERROR); const auto & view_aggregate_output_name = view_aggregate_output->as()->name(); const auto * view_aggreagte = view_name_to_aggreagte.at(view_aggregate_output_name); - if (view_aggreagte->argument_names.size() != 1) - throw Exception("size of rollup aggregate arguments expected be 1", ErrorCodes::LOGICAL_ERROR); - auto & inputs = output_to_inputs[view_aggreagte->argument_names[0]]; - inputs.emplace_back(view_aggreagte->argument_names[0]); - inputs.emplace_back(aggregate.column_name); - - // check whether need rewrite - if (view_aggreagte->function->getName().ends_with("Merge") && !aggregate.function->getName().ends_with("State")) + + auto rewrite = aggregate; + if (view_aggreagte->function->getName().ends_with("Merge")) { - auto rewrite = aggregate; + String function_name = view_aggreagte->function->getName().substr(0, view_aggreagte->function->getName().size() - 5) + "State"; AggregateFunctionProperties properties; rewrite.function = AggregateFunctionFactory::instance().get( - aggregate.function->getName() + "State", - aggregate.function->getArgumentTypes(), - aggregate.function->getParameters(), - properties); + function_name, aggregate.function->getArgumentTypes(), aggregate.function->getParameters(), properties); + } + + // direct rollup + if (view_aggreagte->function->getArgumentTypes().size() == 1 + && rewrite.function->getReturnType()->equals(*view_aggreagte->function->getArgumentTypes()[0])) + { rewrite_aggregates.emplace_back(rewrite); + auto & inputs = output_to_inputs[view_aggreagte->argument_names[0]]; + inputs.emplace_back(view_aggreagte->argument_names[0]); + inputs.emplace_back(aggregate.column_name); + continue; } - else + + // rollup on group by results + if 
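// "Rollup on group by results": when the query aggregate matches the view
// aggregate exactly (name, argument types, parameters), its arguments are
// wired through and appended to the group keys (made distinct below)
// instead of adding another State aggregate, so the union branch appears to
// re-group on those columns directly.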
(aggregate.function->getName() == view_aggreagte->function->getName() + && aggregate.function->getArgumentTypes() == view_aggreagte->function->getArgumentTypes() + && aggregate.function->getParameters() == view_aggreagte->function->getParameters()) { - rewrite_aggregates.emplace_back(aggregate); + for (size_t i = 0; i < view_aggreagte->argument_names.size(); i++) + { + auto & inputs = output_to_inputs[view_aggreagte->argument_names[i]]; + inputs.emplace_back(view_aggreagte->argument_names[i]); + inputs.emplace_back(aggregate.argument_names[i]); + + group_keys.emplace_back(aggregate.argument_names[i]); + } + continue; } + throw Exception("aggregate not supported", ErrorCodes::LOGICAL_ERROR); } InsertFilterRewriter rewriter{context, union_predicate}; @@ -1483,7 +1530,7 @@ class CostBasedMaterializedViewRewriter : public SimplePlanRewriter context->nextNodeId(), std::make_shared( query_with_filter->getStep()->getOutputStream(), - query_step.getKeys(), + makeDistinct(group_keys), query_step.getKeysNotHashed(), rewrite_aggregates, query_step.getGroupingSetsParams(), @@ -1518,6 +1565,22 @@ class CostBasedMaterializedViewRewriter : public SimplePlanRewriter return PlanNodeBase::createPlanNode(context->nextNodeId(), view->getStep(), {unionn}); } + static std::vector makeDistinct(const std::vector & src) + { + std::unordered_set tmp; + std::vector result; + + for (const auto & str : src) + { + if (tmp.insert(str).second) + { + result.push_back(str); + } + } + + return result; + } + // Union + Rollup Rewrite PlanNodePtr planUnion(const PlanNodePtr & view, const PlanNodePtr & query, ASTPtr union_predicate) { @@ -1603,6 +1666,7 @@ class CostBasedMaterializedViewRewriter : public SimplePlanRewriter private: std::unordered_map & match_results; + LoggerPtr log = getLogger("MaterializedViewCostBasedRewriter"); class AggregateRewriter : public SimpleExpressionRewriter { @@ -1725,6 +1789,21 @@ }
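For orientation, a hedged walk-through of the rollup dispatch above; the concrete types are an illustrative example, not taken from this patch. Assume a view that stores `sumState(x)` (read back through `sumMerge(agg)`) and a query computing `sum(x)`:

```cpp
// view column:  agg AggregateFunction(sum, UInt64), matched as sumMerge(agg)
// query:        sum(x)
//
// 1. "sumMerge" ends with "Merge", so the query aggregate is re-resolved as
//    sumState(x); its return type is AggregateFunction(sum, UInt64).
// 2. Direct rollup: that return type equals the single argument type of
//    sumMerge(agg), so the rewritten sumState consumes the view's state column.
// 3. Otherwise (e.g. plain max(x) on both sides), an exact match on name,
//    argument types, and parameters lets the view's per-group results be rolled
//    up, with the arguments appended to group_keys (deduplicated by makeDistinct).
// Any other combination throws "aggregate not supported".
```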
bool MaterializedViewRewriter::rewrite(QueryPlan & plan, ContextMutablePtr context) const +{ + try + { + return rewriteImpl(plan, context); + } + catch (...) + { + if (context->getSettingsRef().enforce_materialized_view_rewrite) + throw; + tryLogCurrentException(log, "materialized view rewrite failed."); + return false; + } +} + +bool MaterializedViewRewriter::rewriteImpl(QueryPlan & plan, ContextMutablePtr context) const { bool enforce = context->getSettingsRef().enforce_materialized_view_rewrite; bool verbose = context->getSettingsRef().enable_materialized_view_rewrite_verbose_log; diff --git a/src/Optimizer/Rewriter/MaterializedViewRewriter.h b/src/Optimizer/Rewriter/MaterializedViewRewriter.h index 4ec3868b054..dc7cf9950d7 100644 --- a/src/Optimizer/Rewriter/MaterializedViewRewriter.h +++ b/src/Optimizer/Rewriter/MaterializedViewRewriter.h @@ -39,10 +39,14 @@ class MaterializedViewRewriter : public Rewriter private: bool rewrite(QueryPlan & plan, ContextMutablePtr context) const override; + + bool rewriteImpl(QueryPlan & plan, ContextMutablePtr context) const; + bool isEnabled(ContextMutablePtr context) const override { return context->getSettingsRef().enable_materialized_view_rewrite || context->getSettingsRef().enable_view_based_query_rewrite; } + LinkedHashMap getRelatedMaterializedViews(QueryPlan & plan, ContextMutablePtr context) const; diff --git a/src/Optimizer/Rewriter/OptimizeTrivialCount.cpp b/src/Optimizer/Rewriter/OptimizeTrivialCount.cpp index c62b6a07260..a9d4b7f1e47 100644 --- a/src/Optimizer/Rewriter/OptimizeTrivialCount.cpp +++ b/src/Optimizer/Rewriter/OptimizeTrivialCount.cpp @@ -136,13 +136,12 @@ PlanNodePtr TrivialCountVisitor::visitAggregatingNode(AggregatingNode & node, Vo if (!num_rows) return visitPlanNode(node, v); - DatabaseAndTableName database_and_table = {storage->getDatabaseName(), storage->getTableName()}; auto read_row_count= std::make_shared(node.getCurrentDataStream().header, select_query.clone(), agg_step.getParams().aggregates[0], - num_rows.value(), agg_step.isFinal(), - database_and_table); + storage->getStorageID()); + read_row_count->setNumRows(num_rows.value()); auto new_child_node= PlanNodeBase::createPlanNode(context->nextNodeId(), std::move(read_row_count), {}); return new_child_node->shared_from_this(); } diff --git a/src/Optimizer/Rewriter/ShareCommonExpression.cpp b/src/Optimizer/Rewriter/ShareCommonExpression.cpp index 0dab9059d2c..c777c6c0cf4 100644 --- a/src/Optimizer/Rewriter/ShareCommonExpression.cpp +++ b/src/Optimizer/Rewriter/ShareCommonExpression.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,12 @@ namespace return !ExpressionDeterminism::canChangeOutputRows(filter.getFilter(), context); } + if (type == IQueryPlanStep::Type::TableScan) + { + const auto & table_scan = dynamic_cast(*node->getStep()); + return !!dynamic_cast(table_scan.getStorage().get()); + } + const static std::unordered_set sharable_steps{ // TODO: some steps are not supported by SymbolTransformMap IQueryPlanStep::Type::Filter, diff --git a/src/Optimizer/Rule/Rewrite/PushIntoTableScanRules.cpp b/src/Optimizer/Rule/Rewrite/PushIntoTableScanRules.cpp index bee71b288f5..ce4c0479303 100644 --- a/src/Optimizer/Rule/Rewrite/PushIntoTableScanRules.cpp +++ b/src/Optimizer/Rule/Rewrite/PushIntoTableScanRules.cpp @@ -280,6 +280,9 @@ TransformResult PushIndexProjectionIntoTableScan::transformImpl(PlanNodePtr node auto copy_step = node->getChildren()[0]->getStep()->copy(rule_context.context); auto * copy_table_step = dynamic_cast(copy_step.get()); + if (!dynamic_cast(copy_table_step->getStorage().get())) + return {}; + if (copy_table_step->hasInlineExpressions()) return {}; diff 
--git a/src/Optimizer/tests/gtest_materialized_view_complex_rewrite.cpp b/src/Optimizer/tests/gtest_materialized_view_complex_rewrite.cpp index e610d5194cb..6b53fe1f062 100644 --- a/src/Optimizer/tests/gtest_materialized_view_complex_rewrite.cpp +++ b/src/Optimizer/tests/gtest_materialized_view_complex_rewrite.cpp @@ -35,6 +35,7 @@ class MaterializedViewRewriteComplexTest : public ::testing::Test #endif settings.emplace("enable_materialized_view_rewrite", "1"); settings.emplace("enable_materialized_view_join_rewriting", "1"); + settings.emplace("enable_materialized_view_union_rewriting", "1"); settings.emplace("enable_materialized_view_rewrite_verbose_log", "1"); settings.emplace("enable_single_distinct_to_group_by", "0"); settings.emplace("materialized_view_consistency_check_method", "NONE"); diff --git a/src/Optimizer/tests/gtest_materialized_view_rewrite.cpp b/src/Optimizer/tests/gtest_materialized_view_rewrite.cpp index a9975afb090..49e67a9bb4d 100644 --- a/src/Optimizer/tests/gtest_materialized_view_rewrite.cpp +++ b/src/Optimizer/tests/gtest_materialized_view_rewrite.cpp @@ -32,6 +32,7 @@ class MaterializedViewRewriteTest : public ::testing::Test #endif settings.emplace("enable_materialized_view_rewrite", "1"); settings.emplace("enable_materialized_view_join_rewriting", "1"); + settings.emplace("enable_materialized_view_union_rewriting", "1"); settings.emplace("enable_materialized_view_rewrite_verbose_log", "1"); settings.emplace("enable_single_distinct_to_group_by", "0"); settings.emplace("enum_replicate_no_stats", "0"); diff --git a/src/Optimizer/tests/gtest_materialized_view_rewrite_additional.cpp b/src/Optimizer/tests/gtest_materialized_view_rewrite_additional.cpp index 8b52e0dec51..321cc438576 100644 --- a/src/Optimizer/tests/gtest_materialized_view_rewrite_additional.cpp +++ b/src/Optimizer/tests/gtest_materialized_view_rewrite_additional.cpp @@ -20,6 +20,7 @@ class MaterializedViewRewriteAdditionalTest : public ::testing::Test #endif settings.emplace("enable_materialized_view_rewrite", "1"); settings.emplace("enable_materialized_view_join_rewriting", "1"); + settings.emplace("enable_materialized_view_union_rewriting", "1"); settings.emplace("enable_materialized_view_rewrite_verbose_log", "1"); settings.emplace("materialized_view_consistency_check_method", "NONE"); settings.emplace("cte_mode", "INLINED"); diff --git a/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.cpp b/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.cpp index ee265d1010f..104aae84b29 100644 --- a/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.cpp +++ b/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.cpp @@ -105,7 +105,7 @@ void BrpcExchangeReceiverRegistryService::registerBRPCSenderFromDisk( client_info.initial_query_id = request->registry().query_id(); /// needed for query exchange log initial_query_id client_info.initial_query_start_time = initial_query_start_time_microseconds / 1000000; client_info.initial_query_start_time_microseconds = initial_query_start_time_microseconds; - query_context->setQueryExpirationTimeStamp(); + query_context->initQueryExpirationTimeStamp(); auto query_id = request->registry().query_id(); auto coordinator_addr = request->registry().coordinator_address(); /// we need to do this as to avoid previous sender waitBecomeRealSender in finish diff --git a/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.h 
b/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.h index 4090c45dbb7..52e945bd187 100644 --- a/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.h +++ b/src/Processors/Exchange/DataTrans/Brpc/BrpcExchangeReceiverRegistryService.h @@ -37,7 +37,7 @@ class BrpcExchangeReceiverRegistryService : public Protos::RegistryService DISK_READER = 1 }; explicit BrpcExchangeReceiverRegistryService(ContextMutablePtr context_) - : context(context_), max_buf_size(context_->getSettingsRef().exchange_stream_max_buf_size) + : context(std::move(context_)), max_buf_size(context->getSettingsRef().exchange_stream_max_buf_size) { } explicit BrpcExchangeReceiverRegistryService(int max_buf_size_) : max_buf_size(max_buf_size_) diff --git a/src/Processors/Exchange/DataTrans/RpcChannelPool.cpp b/src/Processors/Exchange/DataTrans/RpcChannelPool.cpp index c761416843c..d6d70a31f56 100644 --- a/src/Processors/Exchange/DataTrans/RpcChannelPool.cpp +++ b/src/Processors/Exchange/DataTrans/RpcChannelPool.cpp @@ -152,7 +152,9 @@ std::shared_ptr RpcChannelPool::getClient(const String & host_port, c } client = std::make_shared( - host_port, [pool]() -> void { pool->ok_.store(false, std::memory_order_relaxed); }, &connection_pool_options); + host_port, + [pool = std::move(pool)]() -> void { pool->ok_.store(false, std::memory_order_relaxed); }, + &connection_pool_options); std::atomic_store(&pool_clients[index], client); return client; } diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 4fd7d52ac2d..ab54cad9a53 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -170,7 +170,7 @@ void PipelineExecutor::addJob(ExecutingGraph::Node * execution_state) PlanSegmentExecutionInfo info{ .execution_address = AddressInfo(getHostIPFromEnv(), query_context->getTCPPort(), "", "", query_context->getExchangePort())}; - auto result = convertFailurePlanSegmentStatusToResult(query_context, info, exception_code, exception_message); + auto result = convertFailurePlanSegmentStatusToResult(std::move(query_context), info, exception_code, exception_message); reportExecutionResult(result); } } diff --git a/src/Processors/tests/gtest_exchange_sink.cpp b/src/Processors/tests/gtest_exchange_sink.cpp index fb990c4c51d..6d412e7bdd0 100644 --- a/src/Processors/tests/gtest_exchange_sink.cpp +++ b/src/Processors/tests/gtest_exchange_sink.cpp @@ -77,7 +77,7 @@ TEST(ExchangeSink, BroadcastExchangeSinkTest) Chunk chunk = createUInt8Chunk(10, 1, 8); auto total_bytes = chunk.bytes(); - setQueryDuration(); + setQueryDuration(context); for (int i = 0; i < 5; i++) { BroadcastStatus status = source_sender->send(chunk.clone()); @@ -129,7 +129,7 @@ TEST(ExchangeSink, BroadcastExchangeSinkBufferTest) Chunk chunk = createUInt8Chunk(10, 1, 8); auto total_bytes = chunk.bytes(); - setQueryDuration(); + setQueryDuration(context); for (int i = 0; i < 5; i++) { BroadcastStatus status = source_sender->send(chunk.clone()); @@ -181,7 +181,7 @@ TEST(ExchangeSink, LoadBalancedExchangeSinkTest) Chunk chunk = createUInt8Chunk(10, 1, 8); auto total_bytes = chunk.bytes(); - setQueryDuration(); + setQueryDuration(context); for (int i = 0; i < 5; i++) { BroadcastStatus status = source_sender->send(chunk.clone()); @@ -240,7 +240,7 @@ TEST(ExchangeSink, MultiPartitionExchangeSinkTest) auto func = createRepartitionFunction(getContext().context, arguments); auto total_bytes = chunk.bytes(); - setQueryDuration(); + 
setQueryDuration(context); for (int i = 0; i < 5; i++) { BroadcastStatus status = source_sender->send(chunk.clone()); @@ -306,7 +306,7 @@ TEST(ExchangeSink, SinglePartitionExchangeSinkNormalTest) auto func = createRepartitionFunction(getContext().context, arguments); auto total_bytes = chunk.bytes(); - setQueryDuration(); + setQueryDuration(context); for (int i = 0; i < 5; i++) { BroadcastStatus status = source_sender->send(chunk.clone()); @@ -378,7 +378,7 @@ TEST(ExchangeSink, SinglePartitionExchangeSinkPipelineTest) auto func = createRepartitionFunction(getContext().context, arguments); auto chunk_bytes = chunk.bytes(); - setQueryDuration(); + setQueryDuration(context); for (int i = 0; i < 5; i++) { BroadcastStatus status = source_sender->send(chunk.clone()); diff --git a/src/Processors/tests/gtest_exchange_trans.cpp b/src/Processors/tests/gtest_exchange_trans.cpp index f02f1915669..cac8460eea1 100644 --- a/src/Processors/tests/gtest_exchange_trans.cpp +++ b/src/Processors/tests/gtest_exchange_trans.cpp @@ -123,7 +123,7 @@ TEST_F(ExchangeRemoteTest, SendWithTwoReceivers) auto sender_2 = BroadcastSenderProxyRegistry::instance().getOrCreate(receiver_data2); sender_1->accept(getContext().context, header); sender_2->accept(getContext().context, header); - setQueryDuration(); + setQueryDuration(getContext().context); sender_1->send(origin_chunk.clone()); sender_2->send(origin_chunk.clone()); @@ -160,6 +160,7 @@ TEST_F(ExchangeRemoteTest, SerDserChunk) void sender_thread(BroadcastSenderProxyPtr sender, Chunk chunk) { + setQueryDuration(getContext().context); BroadcastStatus status = sender->send(std::move(chunk)); } diff --git a/src/Processors/tests/gtest_rpc_channel_pool.cpp b/src/Processors/tests/gtest_rpc_channel_pool.cpp index e9120459790..424eb91ad45 100644 --- a/src/Processors/tests/gtest_rpc_channel_pool.cpp +++ b/src/Processors/tests/gtest_rpc_channel_pool.cpp @@ -81,7 +81,7 @@ void get_client( TEST_F(RPCchannelPoolTest, single_address_concurrent) { std::vector thread_get_clients; - for (int i = 0; i < 1000; i++) + for (int i = 0; i < 100; i++) { std::thread thread_get_client(get_client, 10000, "127.0.0.1:8001", BrpcChannelPoolOptions::DEFAULT_CONFIG_KEY, false, false); thread_get_clients.push_back(std::move(thread_get_client)); @@ -98,7 +98,7 @@ TEST_F(RPCchannelPoolTest, multi_address_concurrent) = {BrpcChannelPoolOptions::DEFAULT_CONFIG_KEY, BrpcChannelPoolOptions::STREAM_DEFAULT_CONFIG_KEY}; std::vector thread_get_clients; - for (int i = 0; i < 1000; i++) + for (int i = 0; i < 100; i++) { auto address = "127.0.0.1:80" + std::to_string(i % 100); std::thread thread_get_client(get_client, 10000, address, client_types[i % 2], false, false); @@ -118,7 +118,7 @@ TEST_F(RPCchannelPoolTest, check_pool_expire_timer_concurrent) = {BrpcChannelPoolOptions::DEFAULT_CONFIG_KEY, BrpcChannelPoolOptions::STREAM_DEFAULT_CONFIG_KEY}; std::vector thread_get_clients; - for (int i = 0; i < 1000; i++) + for (int i = 0; i < 100; i++) { auto address = "127.0.0.1:80" + std::to_string(i % 100); std::thread thread_get_client(get_client, 10000, address, client_types[i % 2], false, false); @@ -138,7 +138,7 @@ TEST_F(RPCchannelPoolTest, check_pool_expire_timer) = {BrpcChannelPoolOptions::DEFAULT_CONFIG_KEY, BrpcChannelPoolOptions::STREAM_DEFAULT_CONFIG_KEY}; std::vector thread_get_clients; - for (int i = 0; i < 1000; i++) + for (int i = 0; i < 100; i++) { auto address = "127.0.0.1:80" + std::to_string(i % 100); std::thread thread_get_client(get_client, 10000, address, client_types[i % 2], true, false); @@ -156,7 
+156,7 @@ TEST_F(RPCchannelPoolTest, construct_random_exceptions) = {BrpcChannelPoolOptions::DEFAULT_CONFIG_KEY, BrpcChannelPoolOptions::STREAM_DEFAULT_CONFIG_KEY}; std::vector thread_get_clients; - for (int i = 0; i < 1000; i++) + for (int i = 0; i < 100; i++) { auto address = "127.0.0.1:80" + std::to_string(i % 100); std::thread thread_get_client(get_client, 1000, address, client_types[i % 2], false, true); diff --git a/src/Protos/cnch_common.proto b/src/Protos/cnch_common.proto index 54d05ef72ba..cbf0eb2336e 100644 --- a/src/Protos/cnch_common.proto +++ b/src/Protos/cnch_common.proto @@ -147,3 +147,7 @@ message LeaderInfo { optional LeaderLease previous_leader_lease = 4; } +enum ResourceRequestType { + RESOURCE_REQUEST = 0; + RESOURCE_RELEASE = 1; +}; diff --git a/src/Protos/disk_cache.proto b/src/Protos/disk_cache.proto index 1739a07a37d..254ebc27327 100644 --- a/src/Protos/disk_cache.proto +++ b/src/Protos/disk_cache.proto @@ -70,6 +70,33 @@ message NexusFSIndexBucket { repeated NexusFSIndexEntry entries = 2; } +message NexusFSFileSegment { + required uint64 segment_id = 1; + required uint32 address_rid = 2; + required uint32 address_offset = 3; + required uint32 size = 4; +} + +message NexusFSFileMeta +{ + required string file_name = 1; + required uint64 file_size = 2; + repeated NexusFSFileSegment segments = 3; +} + +message NexusFSInode { + required string node_key = 1; + required uint64 node_id = 2; + repeated NexusFSFileMeta files = 3; +} + +message NexusFSInodeManager { + required string prefix = 1; + required string surfix = 2; + required NexusFSInode root_inode = 3; + repeated NexusFSInode inodes = 4; +} + message NexusFSConfig { required uint64 version = 1; required uint64 cache_size = 2; @@ -77,8 +104,6 @@ message NexusFSConfig { required uint32 alloc_align_size = 4; required uint32 region_size = 5; required uint32 segment_size = 6; - optional uint64 hole_count = 7; - optional uint64 hole_size_total = 8; - optional bool reinsertion_policy_enabled = 9; - optional uint64 used_size_bytes = 10; + required string file_prefix = 7; + required string file_surfix = 8; } diff --git a/src/Protos/plan_node.proto b/src/Protos/plan_node.proto index 49db172eb26..5fa6ad13531 100644 --- a/src/Protos/plan_node.proto +++ b/src/Protos/plan_node.proto @@ -367,6 +367,7 @@ message ReadStorageRowCountStep { required AggregateDescription agg_desc = 4; required uint64 num_rows = 5; required bool is_final_agg = 6; + optional StorageID storage_id = 7; } message WindowStep { diff --git a/src/Protos/plan_segment_manager.proto b/src/Protos/plan_segment_manager.proto index 1c6f268efec..31867bd7b3c 100644 --- a/src/Protos/plan_segment_manager.proto +++ b/src/Protos/plan_segment_manager.proto @@ -4,6 +4,7 @@ package DB.Protos; import "data_models.proto"; import "plan_node_utils.proto"; +import "cnch_common.proto"; import "plan_node.proto"; import "enum.proto"; @@ -272,6 +273,23 @@ message SendProgressResponse { optional string message = 1; } +// ----- start resource aware scheduling ----- +message GrantResourceRequestReq +{ + optional ResourceRequestType req_type = 1; + optional string query_id = 2; + optional uint64 query_start_ts = 3; + optional uint32 segment_id = 4; + optional uint32 parallel_index = 5; + optional uint32 epoch = 6; + optional bool ok = 7; +} + +message GrantResourceRequestResp +{ +} +// ----- end resource aware scheduling ----- + service PlanSegmentManagerService { rpc submitPlanSegment(SubmitPlanSegmentRequest) returns (ExecutePlanSegmentResponse); @@ -291,4 +309,8 @@ service 
PlanSegmentManagerService { rpc sendProgress(SendProgressRequest) returns (SendProgressResponse); rpc sendPlanSegmentProfile(PlanSegmentProfileRequest) returns (PlanSegmentProfileResponse); + + // ----- start resource aware scheduling ----- + rpc grantResourceRequest(GrantResourceRequestReq) returns (GrantResourceRequestResp); + // ----- end resource aware scheduling ----- }; diff --git a/src/Protos/resource_manager_rpc.proto b/src/Protos/resource_manager_rpc.proto index bb6f0ea3f73..78ef4b60036 100644 --- a/src/Protos/resource_manager_rpc.proto +++ b/src/Protos/resource_manager_rpc.proto @@ -4,6 +4,7 @@ package DB.Protos; option cc_generic_services = true; import "cnch_common.proto"; +import "plan_node_utils.proto"; import "data_models.proto"; /** RPC functions for the Resource Manager @@ -127,6 +128,27 @@ message GetAllWorkersResp { repeated WorkerNodeResourceData worker_data = 4; } +message SendResourceRequestReq +{ + optional AddressInfo server_addr = 1; + optional ResourceRequestType req_type = 2; // request, release + optional string query_id = 3; + optional uint64 query_start_ts = 4; + optional uint32 segment_id = 5; + optional uint32 parallel_index = 6; + optional string worker_id = 7; + optional uint32 request_vcpu = 8; + optional uint32 request_mem = 9; + optional uint32 epoch = 10; +} + +message SendResourceRequestResp +{ + optional bool is_leader = 1; + optional string leader_host_port = 2; + optional string exception = 3; +} + message GetWorkerGroupsReq { required string vw_name = 1; @@ -211,6 +233,7 @@ service ResourceManagerService rpc getAllVirtualWarehouses(GetAllVirtualWarehousesReq) returns (GetAllVirtualWarehousesResp); rpc getAllWorkers(GetAllWorkersReq) returns (GetAllWorkersResp); + rpc sendResourceRequest(SendResourceRequestReq) returns (SendResourceRequestResp); rpc createWorkerGroup(CreateWorkerGroupReq) returns (CreateWorkerGroupResp); rpc dropWorkerGroup(DropWorkerGroupReq) returns (DropWorkerGroupResp);
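A hedged usage sketch for the new resource-request RPC surface; the caller context, field values, and the `rm_client` handle are illustrative assumptions, not part of this patch:

```cpp
// Hypothetical server-side caller asking the resource manager for an execution slot.
Protos::SendResourceRequestReq req;
req.set_req_type(Protos::RESOURCE_REQUEST);       // enum from cnch_common.proto
req.set_query_id("q-123");                        // illustrative values
req.set_segment_id(2);
req.set_parallel_index(0);
req.set_request_vcpu(8);
req.set_request_mem(16384);
server_addr.toProto(*req.mutable_server_addr());  // where the grant should be delivered
rm_client->sendResourceRequest(req);              // routed to the RM leader, which only
                                                  // enqueues it for the ResourceScheduler
```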
diff --git a/src/QueryPlan/GraphvizPrinter.cpp b/src/QueryPlan/GraphvizPrinter.cpp index 3327c7a6c93..2f19043776c 100644 --- a/src/QueryPlan/GraphvizPrinter.cpp +++ b/src/QueryPlan/GraphvizPrinter.cpp @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -2088,9 +2089,9 @@ String StepPrinter::printTableScanStep(const TableScanStep & step) String StepPrinter::printReadStorageRowCountStep(const ReadStorageRowCountStep & step) { - auto database_and_table = step.getDatabaseAndTableName(); + auto storage_id = step.getStorageID(); std::stringstream details; - details << database_and_table.first << "." << database_and_table.second << "|"; + details << storage_id.getFullTableName() << "|"; auto ast = step.getQuery(); auto * query = ast->as(); @@ -3936,6 +3937,7 @@ String GraphvizPrinter::printGroup(const Group & group, const std::unordered_map auto property_str = [&](const Property & property) { std::stringstream ss; ss << property.getNodePartitioning().toString(); + ss << " Component:" << magic_enum::enum_name(property.getNodePartitioning().getComponent()) << "; "; ss << " "; ss << property.getCTEDescriptions().toString(); return ss.str(); diff --git a/src/QueryPlan/LineageInfo.cpp b/src/QueryPlan/LineageInfo.cpp index 76ef5bfad3f..203f5f2c8a4 100644 --- a/src/QueryPlan/LineageInfo.cpp +++ b/src/QueryPlan/LineageInfo.cpp @@ -77,7 +77,7 @@ void LineageInfoVisitor::visitValuesNode(ValuesNode & node, LineageInfoContext & void LineageInfoVisitor::visitReadStorageRowCountNode(ReadStorageRowCountNode & node, LineageInfoContext & lineage_info_context) { const auto * step = node.getStep().get(); - visitISourceNodeWithoutStorage(node, lineage_info_context, {step->getDatabaseAndTableName().first, step->getDatabaseAndTableName().second}); + visitISourceNodeWithoutStorage(node, lineage_info_context, {step->getStorageID().getDatabaseName(), step->getStorageID().getTableName()}); } void LineageInfoVisitor::visitExtremesNode(ExtremesNode & node, LineageInfoContext & lineage_info_context) diff --git a/src/QueryPlan/PlanCache.cpp b/src/QueryPlan/PlanCache.cpp index 3df6b46755c..26c7153eec0 100644 --- a/src/QueryPlan/PlanCache.cpp +++ b/src/QueryPlan/PlanCache.cpp @@ -1,11 +1,12 @@ #include -#include +#include #include #include #include -#include #include #include +#include +#include namespace DB { @@ -85,7 +86,6 @@ PlanNodePtr PlanCacheManager::getNewPlanNode(PlanNodePtr node, ContextMutablePtr if (result_node) children.emplace_back(result_node); } - return PlanNodeBase::createPlanNode(node->getId(), node->getStep()->copy(context), children); } @@ -117,6 +117,17 @@ QueryPlanPtr PlanCacheManager::getPlanFromCache(UInt128 query_hash, ContextMutab if (!plan_object || !plan_object->plan_root) return nullptr; + // check storage version + for (auto & item : plan_object->query_info->tables_version) + { + auto storage = DatabaseCatalog::instance().tryGetTable(item.first, context); + if (!storage || storage->latest_version.toUInt64() != item.second) + { + cached.remove(query_hash); + return nullptr; + } + } + // check statistic version + for (auto & item : plan_object->query_info->stats_version) { @@ -129,6 +140,7 @@ QueryPlanPtr PlanCacheManager::getPlanFromCache(UInt128 query_hash, ContextMutab return nullptr; } } + PlanNodeId max_id; auto root = PlanCacheManager::getNewPlanNode(plan_object->plan_root, context, false, max_id); CTEInfo cte_info; @@ -173,14 +185,14 @@ bool PlanCacheManager::addPlanToCache(UInt128 query_hash, QueryPlanPtr & plan, A for (const auto & cte : plan->getCTEInfo().getCTEs()) plan_object.cte_map.emplace(cte.first, PlanCacheManager::getNewPlanNode(cte.second, context, true, max_id)); - plan_object.query_info = std::make_shared(); + plan_object.query_info = std::make_shared(); const auto & used_columns_map = analysis->getUsedColumns(); for (const auto & [table_ast, storage_analysis] : analysis->getStorages()) { if (!storage_analysis.storage) continue; auto storage_id = storage_analysis.storage->getStorageID(); - if (auto it = used_columns_map.find(storage_analysis.storage->getStorageID()); it != used_columns_map.end()) + if (auto it = used_columns_map.find(storage_id); it != 
used_columns_map.end()) { for (const auto & column : it->second) plan_object.query_info->query_access_info[backQuoteIfNeed(storage_id.getDatabaseName())][storage_id.getFullTableName()].emplace_back(column); @@ -190,6 +202,7 @@ bool PlanCacheManager::addPlanToCache(UInt128 query_hash, QueryPlanPtr & plan, A auto version_value = Statistics::getVersion(context, table_identifier); Int64 version = version_value.has_value() ? version_value.value().convertTo() : 0; plan_object.query_info->stats_version[storage_id] = version; + plan_object.query_info->tables_version[storage_id] = storage_analysis.storage->latest_version.toUInt64(); } cache.add(query_hash, plan_object); return true; } diff --git a/src/QueryPlan/PlanCache.h b/src/QueryPlan/PlanCache.h index 508d1ec3851..cdb37eeb862 100644 --- a/src/QueryPlan/PlanCache.h +++ b/src/QueryPlan/PlanCache.h @@ -1,9 +1,11 @@ #pragma once +#include #include +#include #include #include -#include +#include namespace DB { @@ -20,17 +22,18 @@ namespace PlanCacheConfig class PlanCacheManager { public: - struct QueryInfo + struct PlanCacheInfo { // database_name->table_name->column_names std::unordered_map>> query_access_info; std::unordered_map stats_version; + std::unordered_map tables_version; }; struct PlanObjectValue { PlanNodePtr plan_root; std::unordered_map cte_map; - std::shared_ptr query_info; + std::shared_ptr query_info; }; using CacheType = Poco::ExpireLRUCache;
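In effect, a cached plan is now invalidated either by a statistics version bump or by a table version bump. A minimal sketch of the resulting lookup flow; the replanning helper is an assumption, not from this patch:

```cpp
// Assumed flow: DDL bumps storage->latest_version, so the next getPlanFromCache
// call sees a mismatch against query_info->tables_version, evicts the entry,
// and the caller replans and re-registers the plan.
auto cached_plan = PlanCacheManager::getPlanFromCache(query_hash, context);
if (!cached_plan)
{
    auto fresh_plan = planQuery(query_ast, analysis, context); // hypothetical helper
    PlanCacheManager::addPlanToCache(query_hash, fresh_plan, analysis, context);
}
```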
diff --git a/src/QueryPlan/QueryPlanner.cpp b/src/QueryPlan/QueryPlanner.cpp index 0d36ca7bb0a..dfccc9c9b06 100644 --- a/src/QueryPlan/QueryPlanner.cpp +++ b/src/QueryPlan/QueryPlanner.cpp @@ -343,6 +343,11 @@ RelationPlan QueryPlannerVisitor::visitASTInsertQuery(ASTPtr & node, const Void insert_node->getCurrentDataStream(), target, total_affected_row_count_symbol, node, insert_select_with_profiles), {insert_node}); + if (auto table_finish = std::dynamic_pointer_cast(return_node->getStep())) + { + table_finish->preExecute(context); + } + PRINT_PLAN(return_node, plan_insert); return {return_node, {}}; } diff --git a/src/QueryPlan/ReadStorageRowCountStep.cpp b/src/QueryPlan/ReadStorageRowCountStep.cpp index 9bc5d3cb522..4c23b824b27 100644 --- a/src/QueryPlan/ReadStorageRowCountStep.cpp +++ b/src/QueryPlan/ReadStorageRowCountStep.cpp @@ -13,6 +13,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -21,22 +25,94 @@ namespace DB { -ReadStorageRowCountStep::ReadStorageRowCountStep(Block output_header, ASTPtr query_, AggregateDescription agg_desc_, UInt64 num_rows_, bool is_final_agg_, DatabaseAndTableName database_and_table_) +namespace ErrorCodes +{ + extern const int INCORRECT_RESULT_OF_SCALAR_SUBQUERY; +} + +ReadStorageRowCountStep::ReadStorageRowCountStep(Block output_header, ASTPtr query_, AggregateDescription agg_desc_, bool is_final_agg_, StorageID storage_id_) : ISourceStep(DataStream{.header = output_header}) , query(query_) , agg_desc(agg_desc_) - , num_rows(num_rows_) , is_final_agg(is_final_agg_) - , database_and_table(database_and_table_) + , storage_id(storage_id_) { } std::shared_ptr ReadStorageRowCountStep::copy(ContextPtr ) const { - return std::make_shared(output_stream->header, query, agg_desc, num_rows, is_final_agg, database_and_table); + auto step = std::make_shared(output_stream->header, query, agg_desc, is_final_agg, storage_id); + step->setNumRows(num_rows); + return step; } + void ReadStorageRowCountStep::initializePipeline(QueryPipeline & pipeline, const BuildQueryPipelineSettings & context) { + if (storage_id) + { + std::optional rows_cnt{}; + // get storage + StoragePtr storage = DatabaseCatalog::instance().getTable(storage_id, context.context); + + // get row number + auto & select_query = query->as(); + if (!select_query.where() && !select_query.prewhere()) + { + rows_cnt = storage->totalRows(context.context); + } + else // It's possible to optimize count() given only partition predicates + { + auto interpreter = std::make_shared(query->clone(), context.context, SelectQueryOptions()); + SelectQueryInfo temp_query_info; + temp_query_info.query = interpreter->getQuery(); + temp_query_info.syntax_analyzer_result = interpreter->getSyntaxAnalyzerResult(); + temp_query_info.sets = interpreter->getQueryAnalyzer()->getPreparedSets(); + rows_cnt = storage->totalRowsByPartitionPredicate(temp_query_info, context.context); + } + + if (!rows_cnt) + { + try + { + auto select_list = std::make_shared(); + auto count_func = makeASTFunction("count"); + select_query.refSelect() = std::make_shared(); + select_query.refSelect()->children.emplace_back(count_func); + DataTypes types; + auto pre_execute + = [&types](InterpreterSelectQueryUseOptimizer & interpreter) { types = interpreter.getSampleBlock().getDataTypes(); }; + + auto query_context = createContextForSubQuery(context.context); + SettingsChanges changes; + changes.emplace_back("max_result_rows", 1); + changes.emplace_back("result_overflow_mode", "throw"); + changes.emplace_back("extremes", false); + changes.emplace_back("optimize_trivial_count_query", false); + query_context->applySettingsChanges(changes); + auto block = executeSubPipelineWithOneRow(query, query_context, pre_execute); + + if (block.rows() != 1 || block.columns() != 1) + throw Exception( + ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY, + "Trivial count query returned error data: {}", + select_query.formatForErrorMessage()); + + block = materializeBlock(block); + auto columns = block.getColumns(); + num_rows = columns[0]->getUInt(0); + } + catch (...) + {
+ throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Trivial count query execution failed for query: {}. Please set optimize_trivial_count_query = 0 and try again.", + select_query.formatForErrorMessage()); + } + } + else + num_rows = rows_cnt.value(); + } + const auto & func = agg_desc.function; const AggregateFunctionCount & agg_count = static_cast(*func); Block output_header; @@ -82,10 +158,12 @@ void ReadStorageRowCountStep::toProto(Protos::ReadStorageRowCountStep & proto, b agg_desc.toProto(*proto.mutable_agg_desc()); proto.set_num_rows(num_rows); proto.set_is_final_agg(is_final_agg); + if (storage_id) + storage_id.toProto(*proto.mutable_storage_id()); } std::shared_ptr -ReadStorageRowCountStep::fromProto(const Protos::ReadStorageRowCountStep & proto, ContextPtr) +ReadStorageRowCountStep::fromProto(const Protos::ReadStorageRowCountStep & proto, ContextPtr context) { auto base_output_header = ISourceStep::deserializeFromProtoBase(proto.query_plan_base()); auto query = deserializeASTFromProto(proto.query()); @@ -93,8 +171,11 @@ ReadStorageRowCountStep::fromProto(const Protos::ReadStorageRowCountStep & proto agg_desc.fillFromProto(proto.agg_desc()); auto num_rows = proto.num_rows(); bool is_final = proto.is_final_agg(); - auto step = std::make_shared(base_output_header, query, agg_desc, num_rows, is_final); - + StorageID storage_id = StorageID::createEmpty(); + if (proto.has_storage_id()) + storage_id = StorageID::fromProto(proto.storage_id(), context); + auto step = std::make_shared(base_output_header, query, agg_desc, is_final, storage_id); + step->setNumRows(num_rows); return step; } } diff --git a/src/QueryPlan/ReadStorageRowCountStep.h b/src/QueryPlan/ReadStorageRowCountStep.h index 758eaeaf52c..a287064a4ed 100644 --- a/src/QueryPlan/ReadStorageRowCountStep.h +++ b/src/QueryPlan/ReadStorageRowCountStep.h @@ -9,12 +9,11 @@ namespace DB { -using DatabaseAndTableName = std::pair; class ReadStorageRowCountStep : public ISourceStep { public: - explicit ReadStorageRowCountStep(Block output_header, ASTPtr query_, AggregateDescription agg_desc_, UInt64 num_rows_, bool is_final_agg_, DatabaseAndTableName database_and_table_ = {}); + explicit ReadStorageRowCountStep(Block output_header, ASTPtr query_, AggregateDescription agg_desc_, bool is_final_agg_, StorageID storage_id_); String getName() const override { return "ReadStorageRowCount"; } @@ -24,7 +23,9 @@ class ReadStorageRowCountStep : public ISourceStep ASTPtr getQuery() const { return query; } - DatabaseAndTableName getDatabaseAndTableName() const { return database_and_table; } + StorageID getStorageID() const { return storage_id; } + + void setNumRows(UInt64 num_rows_) { num_rows = num_rows_; } UInt64 getNumRows() const { return num_rows; } @@ -43,7 +44,7 @@ class ReadStorageRowCountStep : public ISourceStep std::shared_ptr optimized_cluster; UInt64 num_rows; bool is_final_agg; - DatabaseAndTableName database_and_table; + StorageID storage_id; } } diff --git a/src/QueryPlan/SymbolMapper.cpp b/src/QueryPlan/SymbolMapper.cpp index 499280214a6..c50d3eb8b2a 100644 --- a/src/QueryPlan/SymbolMapper.cpp +++ b/src/QueryPlan/SymbolMapper.cpp @@ -796,8 +796,10 @@ std::shared_ptr SymbolMapper::map(const OutfileFinishStep & s std::shared_ptr SymbolMapper::map(const ReadStorageRowCountStep & step) { - return std::make_shared( - map(step.getOutputStream().header), step.getQuery(), step.getAggregateDescription(), step.getNumRows(), step.isFinal(), step.getDatabaseAndTableName()); + auto new_step = std::make_shared( + map(step.getOutputStream().header), step.getQuery(), step.getAggregateDescription(), step.isFinal(), step.getStorageID()); + new_step->setNumRows(step.getNumRows()); + return new_step; } std::shared_ptr SymbolMapper::map(const BufferStep & step)
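Note the constructor change: the row count is no longer a constructor argument. A construction sketch under the new API, with the surrounding variables assumed in scope:

```cpp
auto step = std::make_shared<ReadStorageRowCountStep>(
    output_header, select_ast, count_desc, /*is_final_agg_=*/true, storage->getStorageID());
if (precomputed_rows)                  // e.g. OptimizeTrivialCount already knows the count
    step->setNumRows(*precomputed_rows);
// Otherwise initializePipeline() resolves the count lazily: totalRows() when the
// query has no WHERE/PREWHERE, totalRowsByPartitionPredicate() when it does, and
// a count() subquery as the last resort.
```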
diff --git a/src/QueryPlan/TableFinishStep.cpp b/src/QueryPlan/TableFinishStep.cpp index fd4cfdd825e..ce19f8a5b17 100644 --- a/src/QueryPlan/TableFinishStep.cpp +++ b/src/QueryPlan/TableFinishStep.cpp @@ -3,7 +3,17 @@ #include #include #include -#include "Processors/Transforms/TableFinishTransform.h" +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event TableFinishStepPreClearHDFSTableMicroseconds; + extern const Event TableFinishStepPreClearS3TableMicroseconds; +} namespace DB { @@ -38,6 +48,32 @@ TableFinishStep::TableFinishStep( output_stream = {input_stream_.header}; } + +void TableFinishStep::preExecute(ContextMutablePtr query_context) +{ + // FIXME(jiashuo): not thread-safe; don't `insert overwrite` the same table, or different tables sharing a path, in parallel + if (auto * hdfs_table = dynamic_cast(target->getStorage().get())) + { + if (auto * insert = dynamic_cast(query.get()); insert->is_overwrite) + { + Stopwatch watch; + query_context->setSetting("prefer_cnch_catalog", hdfs_table->settings.prefer_cnch_catalog.value); + hdfs_table->clear(query_context); + ProfileEvents::increment(ProfileEvents::TableFinishStepPreClearHDFSTableMicroseconds, watch.elapsedMicroseconds()); + } + } + else if (auto * s3_table = dynamic_cast(target->getStorage().get())) + { + if (auto * insert = dynamic_cast(query.get()); insert->is_overwrite) + { + Stopwatch watch; + query_context->setSetting("prefer_cnch_catalog", s3_table->settings.prefer_cnch_catalog.value); + s3_table->clear(query_context); + ProfileEvents::increment(ProfileEvents::TableFinishStepPreClearS3TableMicroseconds, watch.elapsedMicroseconds()); + } + } +} + std::shared_ptr TableFinishStep::copy(ContextPtr) const { return std::make_shared(input_streams[0], target, output_affected_row_count_symbol, query, insert_select_with_profiles); diff --git a/src/QueryPlan/TableFinishStep.h b/src/QueryPlan/TableFinishStep.h index 31c997cc951..7e1741bb817 100644 --- a/src/QueryPlan/TableFinishStep.h +++ b/src/QueryPlan/TableFinishStep.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -22,6 +23,8 @@ class TableFinishStep : public ITransformingStep { return Type::TableFinish; } + + void preExecute(ContextMutablePtr context); std::shared_ptr copy(ContextPtr) const override; void transformPipeline(QueryPipeline & pipeline, const BuildQueryPipelineSettings & settings) override; diff --git a/src/QueryPlan/tests/gtest_protobuf.cpp b/src/QueryPlan/tests/gtest_protobuf.cpp index 774dd3d4912..1446d54eef5 100644 --- a/src/QueryPlan/tests/gtest_protobuf.cpp +++ b/src/QueryPlan/tests/gtest_protobuf.cpp @@ -1130,9 +1130,9 @@ TEST_F(ProtobufTest, ReadStorageRowCountStep) auto storage_id = test_storage_ids[eng() % 3]; auto query = generateAST(eng); auto agg_desc = generateAggregateDescription(eng, 0); - auto num_rows = eng() % 1000; + // auto num_rows = eng() % 1000; auto is_final_agg = false; - auto s = std::make_shared(base_output_header, query, agg_desc, num_rows, is_final_agg); + auto s = std::make_shared(base_output_header, query, agg_desc, is_final_agg, storage_id); return s; }(); diff --git a/src/ResourceManagement/ResourceManagerClient.cpp b/src/ResourceManagement/ResourceManagerClient.cpp index 62f788b8216..b04d69a65a5 100644 --- a/src/ResourceManagement/ResourceManagerClient.cpp +++ b/src/ResourceManagement/ResourceManagerClient.cpp @@ -439,4 +439,17 @@ AggQueryQueueMap 
ResourceManagerClient::syncQueueDetails(VWQueryQueueMap vw_quer return res; } +void ResourceManagerClient::sendResourceRequest(const Protos::SendResourceRequestReq & request) +{ + brpc::Controller cntl; + Protos::SendResourceRequestResp response; + auto rpc_func = [this, &cntl, &request, &response](std::unique_ptr & stub_) { + stub_->sendResourceRequest(&cntl, &request, &response, nullptr); + + assertController(cntl); + RPCHelpers::checkResponse(response); + }; + + callToLeaderWrapper(response, rpc_func); +} } diff --git a/src/ResourceManagement/ResourceManagerClient.h b/src/ResourceManagement/ResourceManagerClient.h index cb718d474cb..1a7c35bf3ec 100644 --- a/src/ResourceManagement/ResourceManagerClient.h +++ b/src/ResourceManagement/ResourceManagerClient.h @@ -70,11 +70,13 @@ class ResourceManagerClient : public WithContext, public RpcLeaderClientBase void getWorkerGroups(const std::string & vw_name, std::vector & groups_data, std::optional & settings, std::atomic & last_settings_timestamp); bool reportResourceUsage(const WorkerNodeResourceData & data); void registerWorker(const WorkerNodeResourceData & data); void removeWorker(const String & worker_id, const String & vw_name, const String & group_id); AggQueryQueueMap syncQueueDetails(VWQueryQueueMap vw_query_queue_map , std::vector * deleted_vw_list); + void sendResourceRequest(const Protos::SendResourceRequestReq & request); + private: using Stub = Protos::ResourceManagerService_Stub; mutable RWLock leader_mutex = RWLockImpl::create(); diff --git a/src/ResourceManagement/ResourceManagerController.cpp b/src/ResourceManagement/ResourceManagerController.cpp index a7a96be4f85..309af8f25b6 100644 --- a/src/ResourceManagement/ResourceManagerController.cpp +++ b/src/ResourceManagement/ResourceManagerController.cpp @@ -43,6 +43,7 @@ namespace DB::ResourceManagement ResourceManagerController::ResourceManagerController(ContextPtr global_context_) : WithContext(global_context_), log(getLogger("ResourceManagerController")) { + resource_scheduler = std::make_unique(*this); resource_tracker = std::make_unique(*this); vw_manager = std::make_unique(*this); group_manager = std::make_unique(*this); diff --git a/src/ResourceManagement/ResourceManagerController.h b/src/ResourceManagement/ResourceManagerController.h index 0802e0dcdc7..f5ed65146b6 100644 --- a/src/ResourceManagement/ResourceManagerController.h +++ b/src/ResourceManagement/ResourceManagerController.h @@ -15,11 +15,12 @@ #pragma once -#include -#include -#include #include #include +#include +#include +#include +#include #include @@ -51,7 +52,7 @@ using CoordinateDecisions = std::vector; class ResourceManagerController : public WithContext, private boost::noncopyable { public: - ResourceManagerController(ContextPtr global_context_); + explicit ResourceManagerController(ContextPtr global_context_); ~ResourceManagerController(); Catalog::CatalogPtr getCnchCatalog(); @@ -60,6 +61,10 @@ class ResourceManagerController : public WithContext, private boost::noncopyable void initialize(); + auto & getResourceScheduler() + { + return *resource_scheduler; + } auto & getResourceTracker() { return *resource_tracker; } auto & getVirtualWarehouseManager() { return *vw_manager; } auto & getWorkerGroupManager() { return *group_manager; } @@ -84,6 +89,7 @@ class ResourceManagerController : public WithContext, private boost::noncopyable private: LoggerPtr log{nullptr}; + std::unique_ptr resource_scheduler; std::unique_ptr 
resource_tracker; std::unique_ptr vw_manager; std::unique_ptr group_manager; diff --git a/src/ResourceManagement/ResourceManagerServiceImpl.cpp b/src/ResourceManagement/ResourceManagerServiceImpl.cpp index 1bafaeeff42..b5054b08456 100644 --- a/src/ResourceManagement/ResourceManagerServiceImpl.cpp +++ b/src/ResourceManagement/ResourceManagerServiceImpl.cpp @@ -432,4 +432,24 @@ void ResourceManagerServiceImpl::syncQueueDetails( } } +void ResourceManagerServiceImpl::sendResourceRequest( + [[maybe_unused]] ::google::protobuf::RpcController * controller, + const ::DB::Protos::SendResourceRequestReq * request, + ::DB::Protos::SendResourceRequestResp * response, + ::google::protobuf::Closure * done) +{ + brpc::ClosureGuard done_guard(done); + try + { + if (!checkForLeader(response)) + return; + if (!rm_controller.getResourceScheduler().queue.push(*request)) + response->set_exception("enqueue failed"); + } + catch (...) + { + tryLogCurrentException(log, __PRETTY_FUNCTION__); + RPCHelpers::handleException(response->mutable_exception()); + } +} } diff --git a/src/ResourceManagement/ResourceManagerServiceImpl.h b/src/ResourceManagement/ResourceManagerServiceImpl.h index ab3d6ea5474..430b66dc5c1 100644 --- a/src/ResourceManagement/ResourceManagerServiceImpl.h +++ b/src/ResourceManagement/ResourceManagerServiceImpl.h @@ -121,6 +121,12 @@ class ResourceManagerServiceImpl : public Protos::ResourceManagerService ::DB::Protos::SyncQueueDetailsResp * response, ::google::protobuf::Closure * done) override; + void sendResourceRequest( + ::google::protobuf::RpcController * controller, + const ::DB::Protos::SendResourceRequestReq * request, + ::DB::Protos::SendResourceRequestResp * response, + ::google::protobuf::Closure * done) override; + private: LoggerPtr log = getLogger("ResourceManagerServiceImpl"); ResourceManagerController & rm_controller; diff --git a/src/ResourceManagement/ResourceScheduler.cpp b/src/ResourceManagement/ResourceScheduler.cpp new file mode 100644 index 00000000000..9be10ca47b0 --- /dev/null +++ b/src/ResourceManagement/ResourceScheduler.cpp @@ -0,0 +1,72 @@ +#include "ResourceScheduler.h" + +#include +#include +#include +#include +#include +#include "Interpreters/DistributedStages/AddressInfo.h" + +namespace DB::ResourceManagement +{ + +ResourceScheduler::ResourceScheduler(ResourceManagerController & rm_controller_) + : rm_controller(rm_controller_), log(getLogger("ResourceScheduler")) +{ + // todo (wangtao.vip): add config for thread number. 
+ schedule_pool = std::make_unique(16UL); + if (schedule_pool->trySchedule([this]() { scheduleResource(); })) + LOG_INFO(log, "Resource scheduler started."); + else + LOG_ERROR(log, "Start scheduling failed, please restart resource manager to fix it."); +} + +ResourceScheduler::~ResourceScheduler() +{ +} + +void ResourceScheduler::scheduleResource() +{ + // todo (wangtao.vip): decouple scheduling and sending + // todo (wangtao.vip): add timeout and exception handling + // todo (wangtao.vip): fill true logic + Protos::SendResourceRequestReq req; + while (queue.pop(req)) + { + AddressInfo server; + server.fillFromProto(req.server_addr()); + std::shared_ptr rpc_client + = RpcChannelPool::getInstance().getClient(extractExchangeHostPort(server), BrpcChannelPoolOptions::DEFAULT_CONFIG_KEY); + Protos::PlanSegmentManagerService_Stub manager(&rpc_client->getChannel()); + brpc::Controller cntl; + Protos::GrantResourceRequestReq request; + Protos::GrantResourceRequestResp response; + request.set_req_type(req.req_type()); + request.set_query_id(req.query_id()); + request.set_query_start_ts(req.query_start_ts()); + request.set_segment_id(req.segment_id()); + request.set_parallel_index(req.parallel_index()); + request.set_epoch(req.epoch()); + request.set_ok(true); + + // todo (wangtao.vip): add callback and make this call asynchronous + manager.grantResourceRequest(&cntl, &request, &response, nullptr); + + LOG_DEBUG( + log, + "Granted resource request({} {}_{}_{}), result: {}", + request.req_type(), + request.query_id(), + request.segment_id(), + request.parallel_index(), + request.ok()); + } +} + +ContextPtr ResourceScheduler::getContext() const +{ + return rm_controller.getContext(); +} + +} // namespace DB::ResourceManagement diff --git a/src/ResourceManagement/ResourceScheduler.h b/src/ResourceManagement/ResourceScheduler.h new file mode 100644 index 00000000000..775baf7dc47 --- /dev/null +++ b/src/ResourceManagement/ResourceScheduler.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB::ResourceManagement +{ + +class ResourceManagerController; + + +class ResourceScheduler : public boost::noncopyable +{ + using ResourceRequestQueue = ConcurrentBoundedQueue<::DB::Protos::SendResourceRequestReq>; + +public: + explicit ResourceScheduler(ResourceManagerController & rm_controller_); + ~ResourceScheduler(); + + // todo (wangtao.vip): make it private + ResourceRequestQueue queue{10000}; + +private: + ContextPtr getContext() const; + void scheduleResource(); + + ResourceManagerController & rm_controller; + LoggerPtr log; + + + std::unique_ptr schedule_pool; +}; + +using ResourceSchedulerPtr = std::unique_ptr; + +} // namespace DB::ResourceManagement
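The grant call above is synchronous (`done == nullptr`), which is why stack-allocated call state suffices. When the "add callback" TODO lands, the usual brpc asynchronous pattern would look roughly like this sketch (an assumption, not part of this patch); note also that the scheduling thread blocks in `queue.pop()`, so shutdown will need the queue to be closed before the thread pool is destroyed:

```cpp
// Hypothetical async variant: call state must outlive the RPC and is freed in the callback.
static void onGranted(brpc::Controller * cntl, Protos::GrantResourceRequestResp * response)
{
    std::unique_ptr<brpc::Controller> cntl_guard(cntl);
    std::unique_ptr<Protos::GrantResourceRequestResp> response_guard(response);
    if (cntl->Failed())
        LOG_WARNING(getLogger("ResourceScheduler"), "grantResourceRequest failed: {}", cntl->ErrorText());
}

// inside scheduleResource(), replacing the synchronous call:
auto * cntl = new brpc::Controller;
auto * request = new Protos::GrantResourceRequestReq;   // filled as above; must also be
auto * response = new Protos::GrantResourceRequestResp; // freed once the call completes
manager.grantResourceRequest(cntl, request, response,
                             google::protobuf::NewCallback(&onGranted, cntl, response));
```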
diff --git a/src/ResourceManagement/ResourceTracker.h b/src/ResourceManagement/ResourceTracker.h index 7251c2705ec..290aba93882 100644 --- a/src/ResourceManagement/ResourceTracker.h +++ b/src/ResourceManagement/ResourceTracker.h @@ -37,7 +37,7 @@ class ResourceManagerController; class ResourceTracker : public boost::noncopyable { public: - ResourceTracker(ResourceManagerController & rm_controller_); + explicit ResourceTracker(ResourceManagerController & rm_controller_); ~ResourceTracker(); std::vector loadWorkerNode(const String & vw_name, const std::vector & data); diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index 3a28e0c590c..7cda60a8b15 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -291,7 +291,6 @@ void MySQLHandler::run() SettingsChanges setting_changes; setting_changes.emplace_back("dialect_type", String("MYSQL")); connection_context->applySettingsChanges(setting_changes); - connection_context->setCurrentQueryId(fmt::format("mysql:{}", connection_id)); auto & client_info = connection_context->getClientInfo(); client_info.initial_query_id = client_info.current_query_id; diff --git a/src/Storages/DiskCache/Device.cpp b/src/Storages/DiskCache/Device.cpp index 8aa6b9f7059..09341cbb69e 100644 --- a/src/Storages/DiskCache/Device.cpp +++ b/src/Storages/DiskCache/Device.cpp @@ -463,7 +463,7 @@ namespace auto cur_time = getSteadyClock(); auto delay_ms = toMillis(cur_time - start_time).count(); if (delay_ms > static_cast(kIOTimeoutMs)) - LOG_ERROR(getLogger("Device"), + LOG_WARNING(getLogger("Device"), "[{}] IO timeout {}ms (submit +{}ms comp +{}ms): {}", parent.context.getName(), delay_ms, @@ -526,7 +526,7 @@ namespace delay_ms = toMillis(cur_time - comp_time).count(); if (delay_ms > static_cast(kIOTimeoutMs)) - LOG_ERROR(getLogger("Device"), + LOG_WARNING(getLogger("Device"), "[{}] IOReq timeout {}ms (comp +{}ms notify +{}ms): {}", context.getName(), delay_ms, diff --git a/src/Storages/DiskCache/Region.cpp b/src/Storages/DiskCache/Region.cpp index 1199f84d6ec..bf7a7087492 100644 --- a/src/Storages/DiskCache/Region.cpp +++ b/src/Storages/DiskCache/Region.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -182,4 +183,24 @@ void Region::readFromBuffer(UInt32 from_offset, size_t size, char *to) const memcpy(to, buffer->data() + from_offset, size); } +void Region::addHandle(std::shared_ptr &handle) +{ + std::lock_guard g{lock}; + handles.push_back(handle); +} + +void Region::resetHandles() +{ + std::lock_guard g{lock}; + for (auto &handle : handles) + handle->invalidRelAddress(); + handles.clear(); +} + +void Region::getHandles(std::vector> &handles_) +{ + std::lock_guard g{lock}; + handles_ = handles; +} + } diff --git a/src/Storages/DiskCache/Region.h b/src/Storages/DiskCache/Region.h index 1fe46aca9ee..39108f6a1a5 100644 --- a/src/Storages/DiskCache/Region.h +++ b/src/Storages/DiskCache/Region.h @@ -12,6 +12,11 @@ #include #include +namespace DB::NexusFSComponents +{ +class BlockHandle; +} + namespace DB::HybridCache { @@ -183,23 +188,9 @@ class Region // Returns the region id. 
RegionId id() const { return region_id; } - void addKey(UInt64 key) - { - std::lock_guard g{lock}; - keys.push_back(key); - } - - void resetKeys() - { - std::lock_guard g{lock}; - keys.clear(); - } - - void getKeys(std::vector &keys_) - { - std::lock_guard g{lock}; - keys_ = keys; - } + void addHandle(std::shared_ptr &handle); + void resetHandles(); + void getHandles(std::vector> &handles_); private: UInt32 activeOpenLocked() const; @@ -232,7 +223,7 @@ class Region UInt32 num_items{0}; std::unique_ptr buffer{nullptr}; - std::vector keys; + std::vector> handles; mutable TimedMutex lock{TimedMutex::Options(false)}; mutable ConditionVariable cond; diff --git a/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp b/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp index c762b669479..a1c0ec78068 100644 --- a/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp @@ -20,7 +20,6 @@ #include "Common/ProfileEvents.h" #include "Common/Stopwatch.h" -#include "Storages/HDFS/HDFSCommon.h" #include "Common/Exception.h" #include "common/sleep.h" @@ -116,7 +115,6 @@ static void doWithRetry(std::function func) struct ReadBufferFromByteHDFS::ReadBufferFromHDFSImpl { - HDFSConnectionParams hdfs_params; bool pread {false}; RemoteReadLog * remote_read_log; String remote_read_context; @@ -134,8 +132,7 @@ struct ReadBufferFromByteHDFS::ReadBufferFromHDFSImpl const HDFSConnectionParams & hdfs_params_, size_t read_until_position_, const ReadSettings & settings_) - : hdfs_params(hdfs_params_) - , pread(settings_.byte_hdfs_pread) + : pread(settings_.byte_hdfs_pread) , remote_read_log(settings_.remote_read_log) , remote_read_context(settings_.remote_read_context) , read_until_position(read_until_position_) @@ -263,14 +260,23 @@ ReadBufferFromByteHDFS::ReadBufferFromByteHDFS( off_t read_until_position_, std::optional file_size_) : ReadBufferFromFileBase(use_external_buffer_ ? 
0 : read_settings.remote_fs_buffer_size, existing_memory_, alignment_, file_size_) + , hdfs_file_path(hdfs_file_path_) + , hdfs_params(hdfs_params_) + , read_until_position(read_until_position_) , settings(read_settings) - , impl(std::make_unique(hdfs_file_path_, hdfs_params_, read_until_position_, settings)) + , impl(nullptr) , total_network_throttler(settings.remote_throttler) { } ReadBufferFromByteHDFS::~ReadBufferFromByteHDFS() = default; +void ReadBufferFromByteHDFS::initImpl() +{ + chassert(!impl); + impl = std::make_unique(hdfs_file_path, hdfs_params, read_until_position, settings); +} + IAsynchronousReader::Result ReadBufferFromByteHDFS::readInto(char * data, size_t size, size_t read_offset, size_t ignore_bytes) { /** @@ -310,6 +316,8 @@ IAsynchronousReader::Result ReadBufferFromByteHDFS::readInto(char * data, size_t bool ReadBufferFromByteHDFS::nextImpl() { + if (!impl) + initImpl(); int bytes_read = impl->readImpl(internal_buffer.begin(), internal_buffer.size()); if (bytes_read) { @@ -339,6 +347,8 @@ off_t ReadBufferFromByteHDFS::seek(off_t offset_, int whence_) /// impl->getPosition() is the file position of the working buffer end /// Therefore working buffer corresponds to the file range /// [impl->getPosition() - working_buffer.size(), impl->getPosition()] + if (!impl) + initImpl(); if (!working_buffer.empty() && size_t(offset_) >= impl->getPosition() - working_buffer.size() && offset_ <= impl->getPosition()) @@ -356,6 +366,8 @@ off_t ReadBufferFromByteHDFS::seek(off_t offset_, int whence_) off_t ReadBufferFromByteHDFS::getPosition() { + if (!impl) + initImpl(); return impl->getPosition() - available(); } @@ -363,6 +375,8 @@ size_t ReadBufferFromByteHDFS::getFileSize() { if (file_size) return *file_size; + if (!impl) + initImpl(); file_size = impl->getFileSize(); return *file_size; @@ -370,20 +384,27 @@ size_t ReadBufferFromByteHDFS::getFileSize() String ReadBufferFromByteHDFS::getFileName() const { - return impl->hdfs_file_path; + return hdfs_file_path; } void ReadBufferFromByteHDFS::setReadUntilPosition(size_t position) { + if (!impl) + initImpl(); impl->setReadUntilPosition(position); } void ReadBufferFromByteHDFS::setReadUntilEnd() { + if (!impl) + initImpl(); impl->setReadUntilEnd(); } -size_t ReadBufferFromByteHDFS::getFileOffsetOfBufferEnd() const { +size_t ReadBufferFromByteHDFS::getFileOffsetOfBufferEnd() const +{ + if (!impl) + return 0; // file_offset=0 at the construction of ReadBufferFromHDFSImpl return impl->file_offset; } @@ -393,7 +414,7 @@ size_t ReadBufferFromByteHDFS::readBigAt(char * to, size_t n, size_t range_begin return 0; auto pooled_impl = impl_pool.get([this] (){ - return new ReadBufferFromHDFSImpl(impl->hdfs_file_path, impl->hdfs_params, 0, settings); + return new ReadBufferFromHDFSImpl(hdfs_file_path, hdfs_params, 0, settings); }); pooled_impl->seek(range_begin); diff --git a/src/Storages/HDFS/ReadBufferFromByteHDFS.h b/src/Storages/HDFS/ReadBufferFromByteHDFS.h index 54e8bea022d..cf87e4b1814 100644 --- a/src/Storages/HDFS/ReadBufferFromByteHDFS.h +++ b/src/Storages/HDFS/ReadBufferFromByteHDFS.h @@ -17,6 +17,7 @@ #include "Common/ObjectPool.h" #include "Common/config.h" +#include "Storages/HDFS/HDFSCommon.h" #if USE_HDFS #include "Core/Defines.h" @@ -65,6 +66,12 @@ struct ReadBufferFromHDFSImpl; bool isSeekCheap() override { return true; } private: + + void initImpl(); + + const String hdfs_file_path; + const HDFSConnectionParams hdfs_params; + const off_t read_until_position; ReadSettings settings; std::unique_ptr impl; ThrottlerPtr 
total_network_throttler; diff --git a/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp b/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp index 0574da386b9..58ea6e1430b 100644 --- a/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp +++ b/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp @@ -584,21 +584,35 @@ size_t MergeTreeCNCHDataDumper::writeProjectionPart( { auto & checksums_files = projection_part->checksums_ptr->files; reordered_checksums.reserve(checksums_files.size()); - for (const auto & col_name : projection_description.column_names) + auto add_to_reordered_checksums = [&](std::vector extensions) { - const auto & name = ISerialization::getFileNameForStream(col_name, {}); - for (const auto & extension : {".bin", ".mrk"}) + for (const auto & col_name : projection_description.column_names) { - if (auto it = checksums_files.find(name + extension); it != checksums_files.end() && !it->second.is_deleted) + const auto & name = ISerialization::getFileNameForStream(col_name, {}); + for (const auto & extension : extensions) { - reordered_checksums.push_back(&*it); - } - else - { - LOG_ERROR(log, "Fail to find column {} in projection {}", name + extension, projection_name); + if (auto it = checksums_files.find(name + extension); it != checksums_files.end() && !it->second.is_deleted) + { + reordered_checksums.push_back(&*it); + } + else + { + LOG_ERROR(log, "Fail to find column {} in projection {}", name + extension, projection_name); + } } } + }; + if (version == MERGE_TREE_CHCH_DATA_STORAGTE_CONCENTRATED_MARK_LAYOUT_VERSION) + { + add_to_reordered_checksums({".mrk"}); + add_to_reordered_checksums({".bin"}); + } + else + { + add_to_reordered_checksums({".bin", ".mrk"}); } + + for (auto & file : reordered_checksums) { file->second.file_offset = data_file_offset; diff --git a/src/Storages/NexusFS/HitsReinsertionPolicy.h b/src/Storages/NexusFS/HitsReinsertionPolicy.h deleted file mode 100644 index 80397f596c2..00000000000 --- a/src/Storages/NexusFS/HitsReinsertionPolicy.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -namespace DB::NexusFSComponents -{ -class NexusFSHitsReinsertionPolicy : public HybridCache::BlockCacheReinsertionPolicy -{ -public: - explicit NexusFSHitsReinsertionPolicy(UInt8 hits_threshold_, const NexusFSIndex & index_) : hits_threshold{hits_threshold_}, index{index_} { } - - bool shouldReinsert(StringRef key) override - { - const auto lr = index.peek(makeHashKey(HybridCache::BufferView{key.size, reinterpret_cast(key.data)}).keyHash()); - return lr.isFound() && lr.getCurrentHits() >= hits_threshold; - } - -private: - const UInt8 hits_threshold{}; - - const NexusFSIndex & index; -}; -} diff --git a/src/Storages/NexusFS/NexusFS.cpp b/src/Storages/NexusFS/NexusFS.cpp index c3c845add93..61a4b6742c0 100644 --- a/src/Storages/NexusFS/NexusFS.cpp +++ b/src/Storages/NexusFS/NexusFS.cpp @@ -1,46 +1,65 @@ +#include + #include #include #include +#include #include #include #include -#include -#include #include -#include +#include #include -#include +#include #include #include -#include -#include +#include #include -#include "common/unit.h" #include #include +#include #include #include #include #include +#include +#include namespace ProfileEvents { -extern const Event NexusFSDiskCacheHit; -extern const Event NexusFSDiskCacheHitInflightInsert; -extern const Event NexusFSDiskCacheMiss; +extern const Event NexusFSHit; +extern const Event NexusFSHitInflightInsert; +extern const Event NexusFSMiss; +extern const Event NexusFSPreload; 
+extern const Event NexusFSDeepRetry;
 extern const Event NexusFSDiskCacheEvict;
-extern const Event NexusFSDiskCachePreload;
-extern const Event NexusFSDiskCacheLookupRetries;
 extern const Event NexusFSDiskCacheInsertRetries;
 extern const Event NexusFSDiskCacheError;
 extern const Event NexusFSDiskCacheBytesRead;
 extern const Event NexusFSDiskCacheBytesWrite;
-extern const Event NexusFSMemoryBufferHit;
-extern const Event NexusFSMemoryBufferMiss;
-extern const Event NexusFSMemoryBufferError;
-extern const Event NexusFSMemoryBufferBytesRead;
+extern const Event NexusFSReadFromInsertCxt;
+extern const Event NexusFSReadFromInsertCxtRetry;
+extern const Event NexusFSReadFromInsertCxtDeepRetry;
+extern const Event NexusFSReadFromInsertCxtBytesRead;
+extern const Event NexusFSReadFromInsertCxtNonCopy;
+extern const Event NexusFSReadFromInsertCxtNonCopyBytesRead;
+extern const Event NexusFSReadFromDisk;
+extern const Event NexusFSReadFromDiskRetry;
+extern const Event NexusFSReadFromDiskDeepRetry;
+extern const Event NexusFSReadFromDiskBytesRead;
+extern const Event NexusFSReadFromBuffer;
+extern const Event NexusFSReadFromBufferRetry;
+extern const Event NexusFSReadFromBufferDeepRetry;
+extern const Event NexusFSReadFromBufferBytesRead;
+extern const Event NexusFSReadFromBufferNonCopy;
+extern const Event NexusFSReadFromBufferNonCopyBytesRead;
+extern const Event NexusFSReadFromSourceBytesRead;
+extern const Event NexusFSReadFromSourceMicroseconds;
+extern const Event NexusFSTimeout;
+extern const Event NexusFSPrefetchToBuffer;
+extern const Event NexusFSPrefetchToBufferBytesRead;
 }

 namespace DB::ErrorCodes
@@ -49,6 +68,8 @@ extern const int INVALID_CONFIG_PARAMETER;
 extern const int NOT_IMPLEMENTED;
 extern const int TIMEOUT_EXCEEDED;
 extern const int CANNOT_OPEN_FILE;
+extern const int CANNOT_FSTAT;
+extern const int CANNOT_TRUNCATE_FILE;
 }

 namespace DB
@@ -67,38 +88,33 @@ UInt64 alignUp(UInt64 num, UInt64 alignment)
     return alignDown(num + alignment - 1, alignment);
 }

-File NexusFSConfig::openFile(const std::string & file_name, UInt64 size, bool truncate)
+File NexusFSConfig::openFile(const std::string & file_name, UInt64 size, bool truncate, bool direct_io)
 {
     LOG_INFO(log, "create file: {} size: {}, truncate: {}", file_name, size, truncate);
     if (file_name.empty())
         throw Exception(ErrorCodes::BAD_ARGUMENTS, "file name is empty");

-    // TODO: use DIRECT_IO
     int flags{O_RDWR | O_CREAT};
+    if (direct_io)
+        flags |= O_DIRECT;

     File f = File(file_name.c_str(), flags);
     chassert(f.getFd() >= 0);

     struct stat file_stat;
     if (fstat(f.getFd(), &file_stat) < 0)
-        throw std::system_error(errno, std::system_category(), fmt::format("failed to get the file stat for file {}", file_name));
+        throwFromErrno(fmt::format("failed to get the file stat for file {}", file_name), ErrorCodes::CANNOT_FSTAT);

     UInt64 cur_file_size = file_stat.st_size;

     if (truncate && cur_file_size < size)
     {
         if (::ftruncate(f.getFd(), size) < 0)
-            throw std::system_error(
-                errno,
-                std::system_category(),
-                fmt::format("ftruncate failed with requested size {}, current size {}", size, cur_file_size));
+            throwFromErrno(
+                fmt::format("ftruncate failed with requested size {}, current size {}", size, cur_file_size),
+                ErrorCodes::CANNOT_TRUNCATE_FILE);

-        LOG_INFO(
-            log,
-            "cache file {} is ftruncated from {} bytes to {} bytes",
-            file_name,
-            cur_file_size,
-            size);
+        LOG_INFO(log, "cache file {} is ftruncated from {} bytes to {} bytes", file_name, cur_file_size, size);
     }

     return f;
@@ -110,16 +126,24 @@ void NexusFSConfig::loadFromConfig(const 
Poco::Util::AbstractConfiguration & con cache_size = conf.getUInt64(config_name + ".cache_size", 10 * GiB); region_size = conf.getUInt64(config_name + ".region_size", 1 * MiB); segment_size = conf.getUInt64(config_name + ".segment_size", 128 * KiB); - alloc_align_size = conf.getUInt(config_name + ".alloc_align_size", 4096); + alloc_align_size = conf.getUInt(config_name + ".alloc_align_size", 512); io_align_size = conf.getUInt(config_name + ".io_align_size", 4096); stripe_size = conf.getUInt(config_name + ".stripe_size", 4096); + reader_threads = conf.getUInt(config_name + ".reader_threads", getNumberOfPhysicalCPUCores() >> 1); clean_regions_pool = conf.getUInt(config_name + ".clean_regions_pool", 4); clean_region_threads = conf.getUInt(config_name + ".clean_region_threads", 2); num_in_mem_buffers = conf.getUInt(config_name + ".num_in_mem_buffers", 8); - memory_cache_size = conf.getUInt64(config_name + ".memory_cache_size", 1 * GiB); + enable_memory_buffer = conf.getBool(config_name + ".enable_memory_buffer", false); + support_prefetch = conf.getBool(config_name + ".support_prefetch", true); + memory_buffer_size = conf.getUInt64(config_name + ".memory_buffer_size", 10 * GiB); + memory_buffer_cooling_percent = conf.getDouble(config_name + ".memory_buffer_cooling_percent", 0.1); + memory_buffer_freed_percent = conf.getDouble(config_name + ".memory_buffer_freed_percent", 0.05); timeout_ms = conf.getUInt(config_name + ".timeout_ms", 10000); + filemeta_gc_interval_s = conf.getUInt(config_name + ".filemeta_gc_interval_s", 300); bool use_memory_device = conf.getBool(config_name + ".use_memory_device", false); bool enable_async_io = conf.getBool(config_name + ".enable_async_io", false); + file_prefix = conf.getString(config_name + ".file_prefix", ""); + file_surfix = conf.getString(config_name + ".file_surfix", ""); double metadata_percentage = conf.getDouble(config_name + ".metadata_percentage", 0.01); metadata_size = alignUp(static_cast(metadata_percentage * cache_size), region_size); @@ -147,24 +171,16 @@ void NexusFSConfig::loadFromConfig(const Poco::Util::AbstractConfiguration & con File f; try { - f = openFile(path, cache_size, true); + f = openFile(path, cache_size, true, false); } - catch (const std::exception & e) + catch (const ErrnoException & e) { - LOG_ERROR( - &Poco::Logger::get("NexusFSConfig"), "Exception in openFile {}, error: {} errno: {}", path, e.what(), errno); + LOG_ERROR(getLogger("NexusFSConfig"), "Exception in openFile {}, error: {} errno: {}", path, e.what(), e.getErrno()); throw; } file_vec.push_back(std::move(f)); } - device = createDirectIoFileDevice( - std::move(file_vec), - cache_size, - io_align_size, - stripe_size, - 0, - io_engine, - q_depth); + device = createDirectIoFileDevice(std::move(file_vec), cache_size, io_align_size, stripe_size, 0, io_engine, q_depth); } validate(); @@ -179,16 +195,17 @@ NexusFSConfig & NexusFSConfig::validate() if (cache_size <= 0) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "invalid size"); if (cache_size % region_size != 0) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, - fmt::format("Cache size is not aligned to region size! cache size: {} region size: {]}", cache_size, region_size)); + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + fmt::format("Cache size is not aligned to region size! 
cache size: {} region size: {}", cache_size, region_size));
     if (getNumberRegions() < clean_regions_pool)
         throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "not enough space on device");
     if (num_in_mem_buffers == 0)
         throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "there must be at least one in-mem buffer");
     if (num_priorities == 0)
         throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "allocator must have at least one priority");
-
-    reinsertion_config.validate();
+    if (reader_threads == 0)
+        throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "reader threads must be greater than 0");

     return *this;
 }
@@ -198,10 +215,12 @@ NexusFS::NexusFS(NexusFSConfig && config)
     : serialized_config{serializeConfig(config)}
     , device{std::move(config.device)}
     , alloc_align_size{config.alloc_align_size}
-    , region_size{config.region_size}
     , metadata_size(config.metadata_size)
     , segment_size(config.segment_size)
     , timeout_ms(config.timeout_ms)
+    , file_prefix(config.file_prefix)
+    , file_surfix(config.file_surfix)
+    , index(getFilePrefix(), getFileSurfix(), getSegmentSize())
 //    , num_priorities{config.num_priorities}
 //    , check_expired{std::move(config.check_expired)}
 //    , destructor_callback{std::move(config.destructor_callback)}
@@ -221,14 +240,32 @@ NexusFS::NexusFS(NexusFSConfig && config)
         config.num_priorities,
         config.in_mem_buf_flush_retry_limit}
     , allocator{region_manager, config.num_priorities}
-    , reinsertion_policy{makeReinsertionPolicy(config.reinsertion_config)}
-    , enable_segment_cache(false)
-    // , segment_cache(config.memory_cache_size / segment_size)
+    , enable_buffer(config.enable_memory_buffer)
+    , support_prefetch(config.support_prefetch)
+    , buffer_manager(
+        config.enable_memory_buffer ?
+            BufferManager::initInstance(
+                config.memory_buffer_size,
+                config.segment_size,
+                config.filemeta_gc_interval_s,
+                config.memory_buffer_cooling_percent,
+                config.memory_buffer_freed_percent,
+                region_manager,
+                index)
+            : nullptr)
 {
+    if (support_prefetch)
+    {
+        for (uint32_t i = 0; i < config.reader_threads; i++)
+        {
+            auto name = fmt::format("NexusFS_read_worker_{}", i);
+            reader_workers.emplace_back(std::make_unique(name));
+        }
+    }
     LOG_TRACE(log, "NexusFS created");
 }

-void NexusFS::preload(const String &file, const OffsetAndSizeVector &offsets_and_sizes, std::unique_ptr &source)
+void NexusFS::preload(const String & file, const OffsetAndSizeVector & offsets_and_sizes, std::unique_ptr & source)
 {
     std::unordered_set segment_ids;
     for (const auto & [offset, size] : offsets_and_sizes)
@@ -240,166 +277,539 @@ void NexusFS::preload(const String &file, const OffsetAndSizeVector &offsets_and
     }
     LOG_TRACE(log, "preload {} segments from {}", segment_ids.size(), file);
-    ProfileEvents::increment(ProfileEvents::NexusFSDiskCachePreload, segment_ids.size());
+    ProfileEvents::increment(ProfileEvents::NexusFSPreload, segment_ids.size());

     for (auto id : segment_ids)
     {
         String segment_name = getSegmentName(file, id);
-        off_t offset_in_source = getOffsetInSourceFile(id);
-        HashedKey key(segment_name);
-        std::shared_ptr cxt;
-        load(key, offset_in_source, source, cxt);
+        open(segment_name, file, id, source);
     }
 }

-NexusFSIndex::LookupResult NexusFS::load(const HybridCache::HashedKey &key, off_t offset_in_source, std::unique_ptr &source, std::shared_ptr &insert_cxt)
+std::tuple, std::shared_ptr, UInt64>
+NexusFS::open(const String & segment_name, const String & file, const UInt64 segment_id, std::unique_ptr & source)
 {
-    LOG_TRACE(log, "try 
to find {} from index", segment_name); - auto lr = index.lookup(key.keyHash()); - if (!lr.isFound()) + auto seq_number = region_manager.getSeqNumber(); + auto handle = index.lookup(file, segment_id); + std::shared_ptr insert_cxt = nullptr; + if (!handle) { - LOG_TRACE(log, "{}({}) not find, read from source and insert to cache", key.key(), key.keyHash()); + LOG_TRACE(log, "{} not found, check InFlightInserts", segment_name); bool is_newly_created; - insert_cxt = in_flight_inserts.getOrCreateContext(key.keyHash(), is_newly_created); + insert_cxt = in_flight_inserts.getOrCreateContext(segment_name, is_newly_created); if (is_newly_created) { - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheMiss); + // double check index + handle = index.lookup(file, segment_id); + if (handle) + { + in_flight_inserts.removeContext(segment_name); + insert_cxt.reset(); + LOG_TRACE(log, "{} already inserted to index", segment_name); + ProfileEvents::increment(ProfileEvents::NexusFSHit); + } + else + { + LOG_TRACE(log, "create InsertCxt for {}, read from source and insert to cache", segment_name); + ProfileEvents::increment(ProfileEvents::NexusFSMiss); + + { + std::lock_guard lock(insert_cxt->mutex); + ProfileEventTimeIncrement source_watch(ProfileEvents::NexusFSReadFromSourceMicroseconds); + + insert_cxt->buffer = Buffer(segment_size); + off_t offset_in_source = getOffsetInSourceFile(segment_id); + size_t bytes_read + = source->readBigAt(reinterpret_cast(insert_cxt->buffer.data()), segment_size, offset_in_source); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromSourceBytesRead, bytes_read); + LOG_TRACE(log, "read {} bytes from source, key={}, offset={}", bytes_read, segment_name, offset_in_source); + + insert_cxt->buffer.shrink(bytes_read); + insert_cxt->ready = true; + insert_cxt->cv.notify_all(); + } + + try + { + auto get_file_and_segment_size = [&source, this]() { return std::make_pair(source->getFileSize(), segment_size); }; + handle = insert(file, segment_id, insert_cxt->buffer.view(), get_file_and_segment_size); + } + catch (Exception & e) + { + in_flight_inserts.removeContext(segment_name); + throw e; + } + in_flight_inserts.removeContext(segment_name); + } + } + else + { + LOG_TRACE(log, "found InsertCxt for {}, wait and read from InsertCxt", segment_name); + ProfileEvents::increment(ProfileEvents::NexusFSHitInflightInsert); + ProfileEvents::increment(ProfileEvents::NexusFSHit); + } + } + else + { + LOG_TRACE(log, "{} found, {}", segment_name, handle->toString()); + ProfileEvents::increment(ProfileEvents::NexusFSHit); + } + return std::make_tuple(handle, insert_cxt, seq_number); +} - insert_cxt->buffer = Buffer(segment_size); - size_t bytes_read = source->readBigAt(reinterpret_cast(insert_cxt->buffer.data()), segment_size, offset_in_source); - LOG_TRACE(log, "read {} bytes from source", bytes_read); +std::pair +NexusFS::readFromInsertCxtInternal(std::shared_ptr & cxt, const off_t offset_in_segment, const size_t max_size, char * to) const +{ + { + std::unique_lock lock(cxt->mutex); + auto timeout = std::chrono::system_clock::now() + std::chrono::milliseconds(timeout_ms / 3); + if (!cxt->cv.wait_until(lock, timeout, [&] { return cxt->ready; })) + { + // timeout, deep retry + return {OpResult::DEEP_RETRY, 0}; + } + } - insert_cxt->buffer.shrink(bytes_read); - insert_cxt->ready = true; + size_t size = getReadSizeInSegment(offset_in_segment, cxt->buffer.size(), max_size); + if (size == 0) + return {OpResult::SUCCESS, 0}; - lr = insert(key, insert_cxt->buffer.view()); - 
in_flight_inserts.removeContext(key.keyHash()); + memcpy(to, cxt->buffer.data() + offset_in_segment, size); - //TODO: insert into memory cache - } - else + return {OpResult::SUCCESS, size}; +} + +std::pair +NexusFS::readFromInsertCxtInternal(std::shared_ptr & cxt, const off_t offset_in_segment, const size_t max_size) const +{ + { + std::unique_lock lock(cxt->mutex); + auto timeout = std::chrono::system_clock::now() + std::chrono::milliseconds(timeout_ms / 3); + if (!cxt->cv.wait_until(lock, timeout, [&] { return cxt->ready; })) { - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheHitInflightInsert); + // timeout, deep retry + return {OpResult::DEEP_RETRY, NexusFSBufferWithHandle()}; } } - else + + size_t size = getReadSizeInSegment(offset_in_segment, cxt->buffer.size(), max_size); + if (size == 0) + return {OpResult::SUCCESS, NexusFSBufferWithHandle()}; + + NexusFSBufferWithHandle bwh; + bwh.buffer + = std::make_unique>(size, reinterpret_cast(cxt->buffer.data() + offset_in_segment), 0); + bwh.buffer->buffer().resize(size); + bwh.insert_cxt = cxt; + + return {OpResult::SUCCESS, std::move(bwh)}; +} + +std::pair NexusFS::readFromBufferInternal( + std::shared_ptr & handle, const UInt64 seq_number, const off_t offset_in_segment, const size_t size, char * to) +{ + chassert(buffer_manager); + auto [op_result, buffer] = buffer_manager->pin(handle, seq_number); + if (op_result == OpResult::SUCCESS) { - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheHit); + LOG_TRACE( + log, + "{} pinned, going to copy {} bytes from buffer({})", + handle->toString(), + size, + reinterpret_cast(buffer + offset_in_segment)); + chassert(buffer); + memcpy(to, reinterpret_cast(buffer + offset_in_segment), size); + handle->unpin(); + return {OpResult::SUCCESS, size}; + } + return {op_result, 0}; +} + +std::pair NexusFS::readFromBufferInternal( + std::shared_ptr & handle, const UInt64 seq_number, const off_t offset_in_segment, const size_t size) +{ + chassert(buffer_manager); + auto [op_result, buffer] = buffer_manager->pin(handle, seq_number); + if (op_result == OpResult::SUCCESS) + { + LOG_TRACE( + log, + "{} pinned, return a buffer({}) with {} bytes", + handle->toString(), + reinterpret_cast(buffer + offset_in_segment), + size); + chassert(buffer); + NexusFSBufferWithHandle bwh; + bwh.handle = handle; + bwh.buffer = std::make_unique>(size, reinterpret_cast(buffer + offset_in_segment), 0); + bwh.buffer->buffer().resize(size); + return {OpResult::SUCCESS, std::move(bwh)}; + } + return {op_result, NexusFSBufferWithHandle()}; +} + +std::pair NexusFS::readFromDiskInternal( + std::shared_ptr & handle, const UInt64 seq_number, const off_t offset_in_segment, const size_t size, char * to) +{ + if (!handle->isRelAddressValid()) + return {OpResult::DEEP_RETRY, 0}; + + auto addr = handle->getRelAddress(); + chassert(addr.rid().valid()); + RegionDescriptor desc = region_manager.openForRead(addr.rid(), seq_number); + switch (desc.getStatus()) + { + case OpenStatus::Retry: + if (region_manager.getSeqNumber() != seq_number) + return {OpResult::DEEP_RETRY, 0}; + else + return {OpResult::RETRY, 0}; + case OpenStatus::Error: + return {OpResult::ERROR, 0}; + case OpenStatus::Ready: + addr = addr.add(offset_in_segment); + size_t bytes_read = readEntry(desc, addr, size, to); + LOG_TRACE(log, "read {} bytes from disk, addr=<{},{}>", bytes_read, addr.rid().index(), addr.offset()); + if (bytes_read > 0) + { + region_manager.touch(addr.rid()); + } + region_manager.close(std::move(desc)); + return {OpResult::SUCCESS, bytes_read}; 
} - return lr; + return {OpResult::ERROR, 0}; // this line should not be reached } -size_t NexusFS::read(const String &file, const off_t offset, const size_t max_size, std::unique_ptr &source, char *to) +std::pair NexusFS::readFromInsertCxt( + Stopwatch & watch, const String & segment_name, std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size, char * to) + const +{ + auto [op_result, bytes_read] = readFromInsertCxtInternal(cxt, offset_in_segment, max_size, to); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtRetry); + std::tie(op_result, bytes_read) = readFromInsertCxtInternal(cxt, offset_in_segment, max_size, to); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxt); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtBytesRead, bytes_read); + return {false, bytes_read}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, 0}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromInsertCxt failed when reading {}, cxt={}", + segment_name, + reinterpret_cast(cxt.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtDeepRetry); + return {true, 0}; + } +} + +std::pair NexusFS::readFromInsertCxt( + Stopwatch & watch, const String & segment_name, std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size) const +{ + auto [op_result, bwh] = readFromInsertCxtInternal(cxt, offset_in_segment, max_size); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtRetry); + std::tie(op_result, bwh) = readFromInsertCxtInternal(cxt, offset_in_segment, max_size); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtNonCopy); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtNonCopyBytesRead, bwh.getSize()); + return {false, std::move(bwh)}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, NexusFSBufferWithHandle()}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromInsertCxt(non-copy) failed when reading {}, cxt={}", + segment_name, + reinterpret_cast(cxt.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtDeepRetry); + return {true, NexusFSBufferWithHandle()}; + } +} + +std::pair NexusFS::readFromBuffer( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size, + char * to) +{ + auto [op_result, bytes_read] = readFromBufferInternal(handle, seq_number, offset_in_segment, size, to); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferRetry); + std::tie(op_result, bytes_read) = readFromBufferInternal(handle, seq_number, offset_in_segment, size, to); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBuffer); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferBytesRead, bytes_read); + return {false, bytes_read}; + case OpResult::RETRY: + 
ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, 0}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromBuffer failed when reading {}, handle={}", + segment_name, + reinterpret_cast(handle.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferDeepRetry); + return {true, 0}; + } +} + +std::pair NexusFS::readFromBuffer( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size) +{ + auto [op_result, bwh] = readFromBufferInternal(handle, seq_number, offset_in_segment, size); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferRetry); + std::tie(op_result, bwh) = readFromBufferInternal(handle, seq_number, offset_in_segment, size); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferNonCopy); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferNonCopyBytesRead, bwh.getSize()); + return {false, std::move(bwh)}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, NexusFSBufferWithHandle()}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromBuffer(non-copy) failed when reading {}, handle={}", + segment_name, + reinterpret_cast(handle.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferDeepRetry); + return {true, NexusFSBufferWithHandle()}; + } +} + +std::pair NexusFS::readFromDisk( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size, + char * to) +{ + auto [op_result, bytes_read] = readFromDiskInternal(handle, seq_number, offset_in_segment, size, to); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDiskRetry); + std::tie(op_result, bytes_read) = readFromDiskInternal(handle, seq_number, offset_in_segment, size, to); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDisk); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDiskBytesRead, bytes_read); + return {false, bytes_read}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, 0}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::CANNOT_OPEN_FILE, + "readFromDisk failed when reading {}, handle={}", + segment_name, + reinterpret_cast(handle.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDiskDeepRetry); + return {true, 0}; + } +} + +size_t +NexusFS::read(const String & file, const off_t offset, const size_t max_size, std::unique_ptr & source, char * to) { UInt64 segment_id = getSegmentId(offset); String segment_name = getSegmentName(file, segment_id); - off_t offset_in_source = getOffsetInSourceFile(segment_id); off_t offset_in_segment = getOffsetInSegment(offset); - HashedKey key(segment_name); Stopwatch watch; UInt32 num_tries = 0; - for (; watch.elapsedMilliseconds() < timeout_ms; num_tries++) + while (watch.elapsedMilliseconds() < timeout_ms) { - if (num_tries > 0) - 
ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheLookupRetries);
+        num_tries++;
+        if (num_tries > 1)
+            ProfileEvents::increment(ProfileEvents::NexusFSDeepRetry);

-        std::shared_ptr cxt;
-        const auto seq_number = region_manager.getSeqNumber();
-        auto lr = load(key, offset_in_source, source, cxt);
+        auto [handle, cxt, seq_number] = open(segment_name, file, segment_id, source);

-        // read from InsertCxt
         if (cxt)
         {
-            Stopwatch watch_cxt;
-            while (!cxt->ready && watch_cxt.elapsedMilliseconds() < timeout_ms)
-            {
-                std::this_thread::yield();
-            }
-
-            if (!cxt->ready)
-            {
-                LOG_WARNING(log, "stop waiting for InsertCxt to get ready, because of timeout ({}ms)", timeout_ms);
-                break;
-            }
-
-            size_t buffer_size = cxt->buffer.size();
-            if (buffer_size == 0 || buffer_size <= static_cast(offset_in_segment))
-            {
-                return 0;
-            }
-            size_t size = getReadSizeInSegment(offset_in_segment, buffer_size, max_size);
-            chassert(size > 0);
-            memcpy(to, cxt->buffer.data() + offset_in_segment, size);
-
-            return size;
+            // read from InsertCxt
+            auto [should_retry, bytes_read] = readFromInsertCxt(watch, segment_name, cxt, offset_in_segment, max_size, to);
+            if (should_retry)
+                continue;
+            else
+                return bytes_read;
         }

-        chassert(lr.isFound());
-        ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheHit);
-
-        if (lr.getSize() == 0)
+        chassert(handle);
+        size_t size = getReadSizeInSegment(offset_in_segment, handle->getSize(), max_size);
+        if (size == 0)
             return 0;

-        // read from memory buffer or disk
-        if (enable_segment_cache)
+        if (enable_buffer)
         {
-            // // load into memory cache and read
-            // auto callback = [this, &lr]{ return std::make_shared(loadToMemoryCache(lr)); };
-            // auto &handle = segment_cache.getOrSet(key.keyHash(), callback);
-            // auto view = handle.pinMemoryBuffer();
-            // size_t size = getReadSizeInSegment(offset_in_segment, lr.getSize(), max_size);
-            // ProfileEvents::increment(ProfileEvents::NexusFSMemoryBufferBytesRead);
-            // memcpy(to, view.data() + offset_in_segment, size);
-            // handle.unpinMemoryBuffer();
-            // return size;
-            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "memory buffer of NexusFS is not implemented");
+            // read from memory buffer
+            auto [should_retry, bytes_read] = readFromBuffer(watch, segment_name, handle, seq_number, offset_in_segment, size, to);
+            if (should_retry)
+                continue;
+            else
+                return bytes_read;
         }
         else
         {
-            // directly read disk file
-            ProfileEvents::increment(ProfileEvents::NexusFSMemoryBufferMiss);
+            // read from disk
+            auto [should_retry, bytes_read] = readFromDisk(watch, segment_name, handle, seq_number, offset_in_segment, size, to);
+            if (should_retry)
+                continue;
+            else
+                return bytes_read;
+        }
+    }

-            auto addr = lr.getAddress();
-            RegionDescriptor desc = region_manager.openForRead(addr.rid(), seq_number);
-            if (desc.getStatus() == OpenStatus::Retry)
-            {
-                // retry, go back to the for loop
+    ProfileEventTimeIncrement source_watch(ProfileEvents::NexusFSReadFromSourceMicroseconds);
+    LOG_WARNING(log, "read retried {} times and timed out ({}ms), reading directly from source", num_tries, watch.elapsedMilliseconds());
+    size_t bytes_read = source->readBigAt(to, max_size, offset);
+    ProfileEvents::increment(ProfileEvents::NexusFSReadFromSourceBytesRead, bytes_read);
+    LOG_TRACE(log, "read {} bytes from source, key={}, offset={}", bytes_read, segment_name, offset);
+    return bytes_read;
+}
+
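Both read() overloads (the copying one above and the non-copying one that follows) share the same control loop: try a source, retry shallow failures until the deadline, restart from open() on a deep failure, and fall back to the remote source when the clock runs out. A condensed, std-only sketch of that contract; OpResult mirrors the enum declared in NexusFSBuffer.h, and the helper name driveWithDeadline is made up for illustration:

    #include <chrono>
    #include <cstddef>
    #include <stdexcept>
    #include <tuple>
    #include <utility>

    enum class OpResult { SUCCESS, RETRY, DEEP_RETRY, ERROR };

    // attempt() returns {OpResult, bytes_read}. Shallow RETRYs are retried in
    // place until the deadline; DEEP_RETRY (or running out of time) tells the
    // caller to go back to open(); ERROR surfaces as an exception.
    template <typename Attempt>
    std::pair<bool /*restart_from_open*/, size_t>
    driveWithDeadline(Attempt && attempt, std::chrono::steady_clock::time_point deadline)
    {
        auto [result, bytes] = attempt();
        while (result == OpResult::RETRY && std::chrono::steady_clock::now() < deadline)
            std::tie(result, bytes) = attempt();

        switch (result)
        {
            case OpResult::SUCCESS: return {false, bytes};
            case OpResult::ERROR: throw std::runtime_error("read attempt failed");
            default: return {true, 0}; // DEEP_RETRY, or a RETRY that ran out of time
        }
    }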
+NexusFSBufferWithHandle
+NexusFS::read(const String & file, const off_t offset, const size_t max_size, std::unique_ptr & source)
+{
+    UInt64 segment_id = getSegmentId(offset);
+    String segment_name = getSegmentName(file, segment_id);
+    off_t offset_in_segment = getOffsetInSegment(offset);
+
+    Stopwatch watch;
+    UInt32 num_tries = 0;
+    while (watch.elapsedMilliseconds() < timeout_ms)
+    {
+        num_tries++;
+        if (num_tries > 1)
+            ProfileEvents::increment(ProfileEvents::NexusFSDeepRetry);
+
+        auto [handle, cxt, seq_number] = open(segment_name, file, segment_id, source);
+
+        if (cxt)
+        {
+            // read from InsertCxt
+            auto [should_retry, bwh] = readFromInsertCxt(watch, segment_name, cxt, offset_in_segment, max_size);
+            if (should_retry)
                 continue;
-            }
-            if (desc.getStatus() != OpenStatus::Ready)
+            else
+                return std::move(bwh);
+        }

-                throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "fail to open region for read");
+        chassert(handle);
+        size_t size = getReadSizeInSegment(offset_in_segment, handle->getSize(), max_size);
+        if (size == 0)
+            return NexusFSBufferWithHandle();

-            addr = addr.add(offset_in_segment);
-            size_t size = getReadSizeInSegment(offset_in_segment, lr.getSize(), max_size);
-            size_t bytes_read = readEntry(desc, addr, size, to);
-            LOG_TRACE(log, "read {} bytes from {}({})", bytes_read, key.key(), key.keyHash());
-            if (bytes_read > 0)
+        // read from memory buffer
+        chassert(enable_buffer);
+        auto [should_retry, bwh] = readFromBuffer(watch, segment_name, handle, seq_number, offset_in_segment, size);
+        if (should_retry)
+            continue;
+        else
+            return std::move(bwh);
+    }
+
+    LOG_WARNING(log, "read retried {} times and timed out ({}ms), reading directly from source", num_tries, watch.elapsedMilliseconds());
+    NexusFSBufferWithHandle bwh;
+    bwh.buffer = std::make_unique>(max_size, nullptr, 0);
+
+    ProfileEventTimeIncrement source_watch(ProfileEvents::NexusFSReadFromSourceMicroseconds);
+    size_t bytes_read = source->readBigAt(bwh.buffer->position(), max_size, offset);
+    bwh.buffer->buffer().resize(bytes_read);
+    ProfileEvents::increment(ProfileEvents::NexusFSReadFromSourceBytesRead, bytes_read);
+    LOG_TRACE(log, "read {} bytes from source, key={}, offset={}", bytes_read, segment_name, offset);
+    return bwh;
+}
+
+std::future
+NexusFS::prefetchToBuffer(const String & file, off_t offset, size_t max_size, std::unique_ptr & source)
+{
+    auto promise = std::make_shared>();
+
+    if (support_prefetch)
+    {
+        getReadWorker().addTaskRemote([&file, &source, this, offset, max_size, promise]() {
+            try
             {
-                region_manager.touch(addr.rid());
+                auto bwh = read(file, offset, max_size, source);
+                ProfileEvents::increment(ProfileEvents::NexusFSPrefetchToBuffer);
+                ProfileEvents::increment(ProfileEvents::NexusFSPrefetchToBufferBytesRead, bwh.getSize());
+                promise->set_value(std::move(bwh));
             }
-            region_manager.close(std::move(desc));
-
-            return bytes_read;
-        }
+            catch (Exception & e)
+            {
+                promise->set_exception(std::make_exception_ptr(e));
+            }
+        });
     }
+    else
+        promise->set_exception(
+            std::make_exception_ptr(Exception(ErrorCodes::NOT_IMPLEMENTED, "support_prefetch = false, prefetchToBuffer is not supported")));

-    LOG_WARNING(log, "read tries for {} times and timeout ({}ms), read directly from source", num_tries, timeout_ms);
-    size_t bytes_read = source->readBigAt(to, max_size, offset);
-    LOG_TRACE(log, "read {} bytes from source", bytes_read);
-    return bytes_read;
+    return promise->get_future();
 }

-NexusFSIndex::LookupResult NexusFS::insert(const HashedKey &key, BufferView buf_view)
+std::shared_ptr NexusFS::insert(
+    const String & file, UInt64 segment_id, BufferView buf_view, std::function()> get_file_and_segment_size)
 {
     size_t size = buf_view.size();
     UInt32 aligned_size = alignedSize(size);
     if (size == 0)
- return index.insert(key.keyHash(), RelAddress(), 0); + { + auto handle = std::make_shared(RelAddress(), 0); + index.insert(file, segment_id, handle, get_file_and_segment_size); + return handle; + } chassert(size > 0); chassert(size <= segment_size); @@ -415,77 +825,79 @@ NexusFSIndex::LookupResult NexusFS::insert(const HashedKey &key, BufferView buf_ if (desc.getStatus() == OpenStatus::Error) { ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheError); - LOG_ERROR(log, "failed to insert {}({}), size={}", key.key(), key.keyHash(), slot_size); - throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "failed to insert {}({}), size={}", key.key(), key.keyHash(), slot_size); + LOG_ERROR(log, "failed to insert {}#{}, size={}", file, segment_id, slot_size); + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "failed to insert {}#{}, size={}", file, segment_id, slot_size); } - writeEntry(addr, slot_size, key, buf_view); - auto lr = index.insert(key.keyHash(), addr, size); + chassert(addr.offset() + slot_size <= region_manager.regionSize()); + chassert(slot_size % alloc_align_size == 0ULL); + + auto handle = std::make_shared(addr, size); + writeEntry(handle, buf_view); + index.insert(file, segment_id, handle, get_file_and_segment_size); + + LOG_TRACE( + log, + "create {} for {}#{}, write to disk addr=<{},{}>, size={}, slot_size={}", + handle->toString(), + file, + segment_id, + addr.rid().index(), + addr.offset(), + buf_view.size(), + slot_size); + allocator.close(std::move(desc)); - return lr; + return handle; } -void NexusFS::writeEntry(RelAddress addr, UInt32 slot_size, const HashedKey &key, BufferView value) +void NexusFS::writeEntry(std::shared_ptr & handle, HybridCache::BufferView value) { - chassert(addr.offset() + slot_size <= region_manager.regionSize()); - chassert(slot_size % alloc_align_size == 0ULL); - - LOG_TRACE(log, "writeEntry rid={}, off={}, key={}({}), size={} ", addr.rid().index(), addr.offset(), key.key(), key.keyHash(), slot_size); + auto addr = handle->getRelAddress(); + chassert(addr.offset() + value.size() <= region_manager.regionSize()); auto rid = addr.rid(); auto & region = region_manager.getRegion(rid); region.writeToBuffer(addr.offset(), value); - region.addKey(key.keyHash()); + region.addHandle(handle); + num_segments++; ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheBytesWrite, value.size()); } -size_t NexusFS::readEntry(const RegionDescriptor &rdesc, RelAddress addr, UInt32 size, char *to) +size_t NexusFS::readEntry(const RegionDescriptor & rdesc, RelAddress addr, UInt32 size, char * to) { chassert(addr.offset() + size <= region_manager.regionSize()); - LOG_TRACE(log, "readEntry rid={}, off={}, size={} ", addr.rid().index(), addr.offset(), size); + LOG_TRACE(log, "read from disk addr=<{},{}>, size={} ", addr.rid().index(), addr.offset(), size); ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheBytesRead, size); return region_manager.read(rdesc, addr, size, to); } -std::shared_ptr NexusFS::makeReinsertionPolicy(const BlockCacheReinsertionConfig & reinsertion_config) -{ - auto hits_threshold = reinsertion_config.getHitsThreshold(); - if (hits_threshold) - return std::make_shared(hits_threshold, index); - - auto pct_threshold = reinsertion_config.getPctThreshold(); - if (pct_threshold) - return std::make_shared(pct_threshold); - - return reinsertion_config.getCustomPolicy(); -} - -UInt32 NexusFS::onRegionReclaim(RegionId rid, BufferView buffer) +UInt32 NexusFS::onRegionReclaim(RegionId rid, BufferView /*buffer*/) { UInt32 eviction_count = 0; auto & region = 
region_manager.getRegion(rid);
-    std::vector keys;
-    region.getKeys(keys);
-    chassert(region.getNumItems() == keys.size());
+    std::vector> handles;
+    region.getHandles(handles);
+    chassert(region.getNumItems() == handles.size());

-    for (auto key : keys)
+    for (auto & handle : handles)
     {
-        auto lr = index.lookup(key);
-        if (!lr.isFound())
+        if (!handle)
         {
-            LOG_ERROR(log, "reclaim a key {} in from region {}, but it does not exist in index", key, rid.index());
+            LOG_ERROR(log, "reclaim a handle from region {}, but it is null", rid.index());
             continue;
         }
-        auto addr = lr.getAddress();
-        auto size = lr.getSize();
-        BufferView value{size, buffer.data() + addr.offset()};
+        // auto addr = handle->getRelAddress();
+        // chassert(addr.rid().valid());
+        // auto size = handle->getSize();
+        // BufferView value{size, buffer.data() + addr.offset()};

-        const auto reinsertion_res = reinsertOrRemoveItem(key, value, size, addr);
+        const auto reinsertion_res = reinsertOrRemoveItem(handle);
         switch (reinsertion_res)
         {
             case ReinsertionRes::kEvicted:
@@ -501,10 +913,9 @@ UInt32 NexusFS::onRegionReclaim(RegionId rid, BufferView buffer)

         // if (destructor_callback && reinsertion_res == ReinsertionRes::kEvicted)
         //     destructor_callback(key, value, DestructorEvent::Recycled);
-
     }

-    region.resetKeys();
+    region.resetHandles();
     chassert(region.getNumItems() >= eviction_count);
     return eviction_count;
 }
@@ -513,53 +924,35 @@ void NexusFS::onRegionCleanup(RegionId rid, BufferView /*buffer*/)
 {
     UInt32 eviction_count = 0;
     auto & region = region_manager.getRegion(rid);
-    std::vector keys;
-    region.getKeys(keys);
-    chassert(region.getNumItems() == keys.size());
+    std::vector> handles;
+    region.getHandles(handles);
+    chassert(region.getNumItems() == handles.size());

-    for (auto key : keys)
+    for (auto & handle : handles)
     {
-        auto lr = index.lookup(key);
-        if (!lr.isFound())
+        if (!handle)
         {
-            LOG_ERROR(log, "reclaim a key {} in from region {}, but it does not exist in index", key, rid.index());
+            LOG_ERROR(log, "cleanup a handle from region {}, but it is null", rid.index());
             continue;
         }
-        auto addr = lr.getAddress();
+        // auto addr = handle->getRelAddress();
+        // chassert(addr.rid().valid());

-        auto remove_res = removeItem(key, addr);
-
-        if (remove_res)
-            eviction_count++;
+        removeItem(handle);
+        eviction_count++;

         // if (destructor_callback && remove_res)
         //     destructor_callback(key, value, DestructorEvent::Recycled);
     }

-    region.resetKeys();
+    region.resetHandles();
     chassert(region.getNumItems() >= eviction_count);
 }

-NexusFS::ReinsertionRes NexusFS::reinsertOrRemoveItem(UInt64 key, BufferView /*value*/, UInt32 /*entry_size*/, RelAddress addr)
+NexusFS::ReinsertionRes NexusFS::reinsertOrRemoveItem(std::shared_ptr & handle)
 {
-    auto remove_item = [this, key, addr](bool /*expired*/) {
-        if (index.removeIfMatch(key, addr))
-        {
-            // if (expired)
-            //     ProfileEvents::increment(ProfileEvents::BlockCacheEvictionExpiredCount);
-            return ReinsertionRes::kEvicted;
-        }
-        return ReinsertionRes::kRemoved;
-    };
-
-    const auto lr = index.peek(key);
-    if (!lr.isFound() || lr.getAddress() != addr)
-    {
-        // ProfileEvents::increment(ProfileEvents::BlockCacheEvictionLookupMissCount);
-        return ReinsertionRes::kRemoved;
-    }
-
-    return remove_item(true);
+    removeItem(handle);
+    return ReinsertionRes::kRemoved;
 }

 Protos::NexusFSConfig NexusFS::serializeConfig(const NexusFSConfig & config)
@@ -571,63 +964,74 @@ Protos::NexusFSConfig NexusFS::serializeConfig(const NexusFSConfig & config)
     serialized_config.set_alloc_align_size(config.alloc_align_size);
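persist() and recover() further below write this serialized config, the region state and the index as length-prefixed (delimited) protobuf messages into a single metadata stream, and read them back in the same order. A small stand-alone sketch of that framing, using the same protobuf utility calls as the PR (the writeDelimited/readDelimited helper names are made up, and any message type works where Protos::NexusFSConfig is used here):

    #include <google/protobuf/io/coded_stream.h>
    #include <google/protobuf/io/zero_copy_stream_impl_lite.h>
    #include <google/protobuf/util/delimited_message_util.h>
    #include <string>

    template <typename Message>
    std::string writeDelimited(const Message & msg)
    {
        std::string out;
        {
            google::protobuf::io::StringOutputStream raw(&out);
            google::protobuf::io::CodedOutputStream coded(&raw);
            google::protobuf::util::SerializeDelimitedToCodedStream(msg, &coded);
        } // CodedOutputStream trims its buffer back into `out` on destruction
        return out;
    }

    template <typename Message>
    bool readDelimited(const std::string & in, Message & msg)
    {
        google::protobuf::io::ArrayInputStream raw(in.data(), static_cast<int>(in.size()));
        google::protobuf::io::CodedInputStream coded(&raw);
        return google::protobuf::util::ParseDelimitedFromCodedStream(&msg, &coded, nullptr);
    }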
serialized_config.set_region_size(config.region_size); serialized_config.set_segment_size(config.segment_size); - + serialized_config.set_file_prefix(config.file_prefix); + serialized_config.set_file_surfix(config.file_surfix); + return serialized_config; } -bool NexusFS::removeItem(UInt64 key, RelAddress addr) +void NexusFS::removeItem(std::shared_ptr & handle) { - return index.removeIfMatch(key, addr); + num_segments--; + handle->invalidRelAddress(); + LOG_TRACE(log, "invalid {} because of removeItem", handle->toString()); } void NexusFS::persist() { - LOG_INFO(log, "Starting block cache persist"); + LOG_INFO(log, "Starting NexusFS persist"); auto stream = createMetadataOutputStream(*device, metadata_size); Protos::NexusFSConfig config = serialized_config; - config.set_alloc_align_size(alloc_align_size); - config.set_region_size(region_size); - config.set_segment_size(segment_size); - config.set_reinsertion_policy_enabled(reinsertion_policy != nullptr); google::protobuf::io::CodedOutputStream ostream(stream.get()); google::protobuf::util::SerializeDelimitedToCodedStream(config, &ostream); region_manager.persist(&ostream); index.persist(&ostream); - LOG_INFO(log, "Finished block cache persist"); + LOG_INFO(log, "Finished NexusFS persist"); } bool NexusFS::recover() { - LOG_INFO(log, "Starting block cache recovery"); + LOG_INFO(log, "Starting NexusFS recovery"); reset(); + bool recovered = false; try { auto stream = createMetadataInputStream(*device, metadata_size); + if (!stream) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Failed to createMetadataInputStream"); Protos::NexusFSConfig config; google::protobuf::io::CodedInputStream istream(stream.get()); google::protobuf::util::ParseDelimitedFromCodedStream(&config, &istream, nullptr); if (config.cache_size() != serialized_config.cache_size() || config.metadata_size() != serialized_config.metadata_size() || config.region_size() != serialized_config.region_size() || config.segment_size() != serialized_config.segment_size() - || config.version() != serialized_config.version() || config.alloc_align_size() != serialized_config.alloc_align_size()) + || config.version() != serialized_config.version() || config.alloc_align_size() != serialized_config.alloc_align_size() + || config.file_prefix() != serialized_config.file_prefix() || config.file_surfix() != serialized_config.file_surfix()) { LOG_ERROR(log, "Recovery config: {}", config.DebugString()); throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Recovery config does not match cache config"); } region_manager.recover(&istream); - index.recover(&istream); + index.recover(&istream, region_manager, num_segments); + + // successful recovery, invalid current metadata + auto output_stream = createMetadataOutputStream(*device, metadata_size); + if (!output_stream) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Failed to createMetadataOutputStream"); + recovered = output_stream->invalidate(); } catch (const std::exception & e) { LOG_ERROR(log, "Exception: {}", e.what()); - LOG_ERROR(log, "Failed to recover block cache. Resetting cache."); + LOG_ERROR(log, "Failed to recover NexusFS. Resetting cache."); reset(); return false; } - LOG_INFO(log, "Finished block cache recovery"); - return true; + if (recovered) + LOG_INFO(log, "Finished NexusFS recovery. 
Recover {} inodes, {} files, {} segments", index.getNumInodes(), index.getNumFileMetas(), num_segments); + return recovered; } @@ -669,33 +1073,4 @@ bool NexusFS::shutDown() return true; } -// std::shared_ptr NexusFS::loadToMemoryCache(const NexusFSIndex::LookupResult &lr) -// { -// RelAddress addr = lr.getAddress(); -// size_t size = lr.getSize(); -// RegionDescriptor desc(OpenStatus::Retry); -// while (desc.getStatus() == OpenStatus::Retry) -// { -// const auto seq_number = region_manager.getSeqNumber(); //TODO: why we need this? -// desc = region_manager.openForRead(addr.rid(), seq_number); -// } -// if (desc.getStatus() != OpenStatus::Ready) -// { -// // TODO: err codes -// throw Exception("fail to open region for read", ErrorCodes::BAD_ARGUMENTS); -// } -// chassert(desc.getStatus() == OpenStatus::Ready); - -// Buffer buffer(size); -// size_t bytes_read = readEntry(desc, addr, size, reinterpret_cast(buffer.data())); -// chassert(size == bytes_read); -// LOG_TRACE(log, "loadToMemoryCache, read {} bytes from addr=<{},{}>", bytes_read, addr.rid().index(), addr.offset()); -// region_manager.touch(addr.rid()); -// region_manager.close(std::move(desc)); - -// auto handle = lr.getHandler(); -// handle->loadedToMemory(buffer); -// return handle; -// } - } diff --git a/src/Storages/NexusFS/NexusFS.h b/src/Storages/NexusFS/NexusFS.h index 0cdcbbca17b..dc427efc26e 100644 --- a/src/Storages/NexusFS/NexusFS.h +++ b/src/Storages/NexusFS/NexusFS.h @@ -1,37 +1,42 @@ #pragma once #include +#include #include #include #include #include -#include +#include #include #include #include #include #include +#include #include #include +#include #include +#include #include #include #include #include -#include -#include -#include +#include +#include #include +#include +#include #include #include -#include -#include namespace DB { +class NexusFSBufferWithHandle; + class NexusFSConfig { public: @@ -50,7 +55,6 @@ class NexusFSConfig UInt64 cache_size{10 * GiB}; std::unique_ptr eviction_policy; - BlockCacheReinsertionConfig reinsertion_config{}; // Region size UInt64 region_size{1 * MiB}; @@ -60,6 +64,8 @@ class NexusFSConfig UInt32 io_align_size{1}; UInt32 stripe_size{4096}; + UInt32 reader_threads{4}; + UInt32 clean_regions_pool{2}; UInt32 clean_region_threads{2}; @@ -68,15 +74,25 @@ class NexusFSConfig UInt16 num_priorities{1}; - UInt64 memory_cache_size{1 * GiB}; + bool enable_memory_buffer{false}; + bool support_prefetch{true}; + UInt64 memory_buffer_size{1 * GiB}; + double memory_buffer_cooling_percent{0.1}; + double memory_buffer_freed_percent{0.05}; UInt32 timeout_ms{10000}; + UInt32 filemeta_gc_interval_s{300}; + + String file_prefix; + String file_surfix; // Calculate the total region number. 
UInt32 getNumberRegions() const { + chassert(0ul == metadata_size % region_size); chassert(0ul == cache_size % region_size); - return cache_size / region_size; + chassert(cache_size > metadata_size); + return (cache_size - metadata_size) / region_size; } void loadFromConfig(const Poco::Util::AbstractConfiguration & conf); @@ -85,7 +101,7 @@ class NexusFSConfig LoggerPtr log = getLogger("NexusFSConfig"); NexusFSConfig & validate(); - File openFile(const std::string & file_name, UInt64 size, bool truncate); + File openFile(const std::string & file_name, UInt64 size, bool truncate, bool direct_io); }; @@ -96,10 +112,18 @@ class NexusFS { off_t offset; size_t size; - OffsetAndSize(off_t offset_, size_t size_) : offset(offset_), size(size_) {} + OffsetAndSize(off_t offset_, size_t size_) : offset(offset_), size(size_) { } }; using OffsetAndSizeVector = std::vector; + struct InsertCxt + { + std::mutex mutex; + std::condition_variable cv; + HybridCache::Buffer buffer; + bool ready = false; + }; + explicit NexusFS(NexusFSConfig && config); NexusFS(const NexusFS &) = delete; NexusFS & operator=(const NexusFS &) = delete; @@ -108,11 +132,31 @@ class NexusFS UInt64 getSize() const { return region_manager.getSize(); } UInt64 getSegmentSize() const { return segment_size; } + bool supportNonCopyingRead() const { return enable_buffer; } + bool supportPrefetch() const { return support_prefetch; } + String getFilePrefix() const { return file_prefix; } + String getFileSurfix() const { return file_surfix; } + UInt64 getNumSegments() const { return num_segments.load(); } + UInt64 getNumInodes() const { return index.getNumInodes(); } + UInt64 getNumFileMetas() const { return index.getNumFileMetas(); } + std::vector getFileCachedStates() { return index.getFileCachedStates(); } + + HybridCache::FiberThread & getReadWorker() + { + return *(reader_workers[reader_task_counter.fetch_add(1, std::memory_order_relaxed) % reader_workers.size()]); + } + + // preload an array of segments to disk cache + void preload(const String & file, const OffsetAndSizeVector & offsets_and_sizes, std::unique_ptr & source); + // read from nexusfs + size_t read(const String & file, off_t offset, size_t max_size, std::unique_ptr & source, char * to); - void preload(const String &file, const OffsetAndSizeVector &offsets_and_sizes, std::unique_ptr &source); - size_t read(const String &file, off_t offset, size_t max_size, std::unique_ptr &source, char *to); + // read from nexusfs (non-copy) + NexusFSBufferWithHandle read(const String & file, off_t offset, size_t max_size, std::unique_ptr & source); + std::future + prefetchToBuffer(const String & file, off_t offset, size_t max_size, std::unique_ptr & source); void flush(); void drain(); @@ -122,24 +166,17 @@ class NexusFS bool recover(); private: - - struct InsertCxt - { - HybridCache::Buffer buffer; - bool ready = false; //TODO: use waiter - }; - class InFlightInserts { public: - std::shared_ptr getOrCreateContext(UInt32 key, bool &is_newly_created) + std::shared_ptr getOrCreateContext(const String & file_and_segment_id, bool & is_newly_created) { - auto shard = key % kShards; - auto &mutex = mutexs[shard]; - auto &map = maps[shard]; + auto shard = std::hash()(file_and_segment_id) % kShards; + auto & mutex = mutexs[shard]; + auto & map = maps[shard]; { std::lock_guard guard{mutex}; - auto it = map.find(key); + auto it = map.find(file_and_segment_id); if (it != map.end()) { is_newly_created = false; @@ -147,40 +184,40 @@ class NexusFS } is_newly_created = true; auto cxt = std::make_shared(); 
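The surrounding InFlightInserts class is a shard-by-hash map: hashing the "file#segment" key selects one of kShards mutex/map pairs, so concurrent inserts on different segments rarely contend on the same lock. A condensed sketch of the same get-or-create idiom with standard containers (ShardedMap and its int payload are placeholders; the real class stores shared_ptr to InsertCxt):

    #include <array>
    #include <functional>
    #include <memory>
    #include <mutex>
    #include <string>
    #include <unordered_map>

    struct ShardedMap
    {
        static constexpr size_t kShards = 8192;

        std::shared_ptr<int> getOrCreate(const std::string & key, bool & created)
        {
            size_t shard = std::hash<std::string>()(key) % kShards;
            std::lock_guard guard(mutexes[shard]);
            auto & slot = maps[shard][key];   // default-constructs a null entry
            created = (slot == nullptr);
            if (created)
                slot = std::make_shared<int>();
            return slot;
        }

        std::array<std::mutex, kShards> mutexes;
        std::array<std::unordered_map<std::string, std::shared_ptr<int>>, kShards> maps;
    };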
- map[key] = cxt; + map[file_and_segment_id] = cxt; return cxt; } } - std::shared_ptr getContext(UInt32 key) + std::shared_ptr getContext(const String & file_and_segment_id) { - auto shard = key % kShards; - auto &mutex = mutexs[shard]; - auto &map = maps[shard]; + auto shard = std::hash()(file_and_segment_id) % kShards; + auto & mutex = mutexs[shard]; + auto & map = maps[shard]; { std::lock_guard guard{mutex}; - auto it = map.find(key); + auto it = map.find(file_and_segment_id); if (it != map.end()) return it->second; return nullptr; } } - void removeContext(UInt32 key) + void removeContext(const String & file_and_segment_id) { - auto shard = key % kShards; - auto &mutex = mutexs[shard]; - auto &map = maps[shard]; + auto shard = std::hash()(file_and_segment_id) % kShards; + auto & mutex = mutexs[shard]; + auto & map = maps[shard]; { std::lock_guard guard{mutex}; - map.erase(key); + map.erase(file_and_segment_id); } } private: static constexpr UInt32 kShards = 8192; std::array mutexs; - std::array>, kShards> maps; + std::array>, kShards> maps; }; @@ -188,17 +225,74 @@ class NexusFS static constexpr UInt16 kDefaultItemPriority = 0; UInt64 getSegmentId(const off_t offset) const { return offset / segment_size; } - static String getSegmentName(const String file, const UInt64 segment_id) { return file + "#" + std::to_string(segment_id); } + static String getSegmentName(const String file, const UInt64 segment_id) { return file + "#" + std::to_string(segment_id); } off_t getOffsetInSourceFile(const UInt64 segment_id) const { return segment_id * segment_size; } off_t getOffsetInSegment(const off_t file_offset) const { return file_offset % segment_size; } - static size_t getReadSizeInSegment(const off_t offset_in_segemt, const size_t segment_size, const size_t buffer_size) { return std::min(buffer_size, segment_size - offset_in_segemt); } + static size_t getReadSizeInSegment(const off_t offset_in_segemt, const size_t segment_size, const size_t buffer_size) + { + return segment_size >= static_cast(offset_in_segemt) ? 
std::min(buffer_size, segment_size - offset_in_segemt) : 0;
+    }

     UInt32 alignedSize(UInt32 size) const { return roundup(size, alloc_align_size); }

-    NexusFSComponents::NexusFSIndex::LookupResult load(const HybridCache::HashedKey &key, off_t offset_in_source, std::unique_ptr &source, std::shared_ptr &insert_cxt);
-
-    void writeEntry(HybridCache::RelAddress addr, UInt32 slot_size, const HybridCache::HashedKey &key, HybridCache::BufferView value);
-
-    size_t readEntry(const HybridCache::RegionDescriptor &desc, HybridCache::RelAddress addr, UInt32 size, char *to);
+    std::tuple, std::shared_ptr, UInt64> open(
+        const String & segment_name,
+        const String & file,
+        UInt64 segment_id,
+        std::unique_ptr & source);
+
+    std::pair
+    readFromInsertCxtInternal(std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size, char * to) const;
+    std::pair
+    readFromInsertCxtInternal(std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size) const;
+    std::pair readFromBufferInternal(
+        std::shared_ptr & handle, UInt64 seq_number, off_t offset_in_segment, size_t size, char * to);
+    std::pair readFromBufferInternal(
+        std::shared_ptr & handle, UInt64 seq_number, off_t offset_in_segment, size_t size);
+    std::pair readFromDiskInternal(
+        std::shared_ptr & handle, UInt64 seq_number, off_t offset_in_segment, size_t size, char * to);
+
+    // read from insert_cxt
+    std::pair readFromInsertCxt(
+        Stopwatch & watch,
+        const String & segment_name,
+        std::shared_ptr & cxt,
+        off_t offset_in_segment,
+        size_t max_size,
+        char * to) const;
+    // read from insert_cxt (non-copy)
+    std::pair readFromInsertCxt(
+        Stopwatch & watch, const String & segment_name, std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size) const;
+    // read from buffer
+    std::pair readFromBuffer(
+        Stopwatch & watch,
+        const String & segment_name,
+        std::shared_ptr & handle,
+        UInt64 seq_number,
+        off_t offset_in_segment,
+        size_t size,
+        char * to);
+    // read from buffer (non-copy)
+    std::pair readFromBuffer(
+        Stopwatch & watch,
+        const String & segment_name,
+        std::shared_ptr & handle,
+        UInt64 seq_number,
+        off_t offset_in_segment,
+        size_t size);
+    // read from disk
+    std::pair readFromDisk(
+        Stopwatch & watch,
+        const String & segment_name,
+        std::shared_ptr & handle,
+        UInt64 seq_number,
+        off_t offset_in_segment,
+        size_t size,
+        char * to);
+
+
+    void writeEntry(std::shared_ptr & handle, HybridCache::BufferView value);
+
+    size_t readEntry(const HybridCache::RegionDescriptor & desc, HybridCache::RelAddress addr, UInt32 size, char * to);

     UInt32 onRegionReclaim(HybridCache::RegionId rid, HybridCache::BufferView buffer);

@@ -212,15 +306,15 @@ class NexusFS
         kRemoved,
         kEvicted,
     };
-    ReinsertionRes reinsertOrRemoveItem(UInt64 key, HybridCache::BufferView value, UInt32 entry_size, HybridCache::RelAddress addr);
+    ReinsertionRes reinsertOrRemoveItem(std::shared_ptr & handle);

-    bool removeItem(UInt64 key, HybridCache::RelAddress addr);
+    void removeItem(std::shared_ptr & handle);

-    std::shared_ptr makeReinsertionPolicy(const BlockCacheReinsertionConfig & reinsertion_config);
-
-    NexusFSComponents::NexusFSIndex::LookupResult insert(const HybridCache::HashedKey &key, HybridCache::BufferView buf_view);
-
-    // std::shared_ptr loadToMemoryCache(const NexusFSComponents::NexusFSIndex::LookupResult &lr);
+    std::shared_ptr insert(
+        const String & file,
+        UInt64 segment_id,
+        HybridCache::BufferView buf_view,
+        std::function()> get_file_and_segment_size);

     LoggerPtr log = getLogger("NexusFS");
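A worked example of the segment-geometry helpers declared above (getSegmentId, getOffsetInSegment, getReadSizeInSegment), assuming the default 128 KiB segment_size from NexusFSConfig; this is a stand-alone check, not part of the PR:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main()
    {
        constexpr uint64_t segment_size = 128 * 1024;
        uint64_t file_offset = 300 * 1024;                        // 300 KiB into the file
        uint64_t segment_id = file_offset / segment_size;         // -> segment 2
        uint64_t offset_in_segment = file_offset % segment_size;  // -> 44 KiB into it
        assert(segment_id == 2 && offset_in_segment == 44 * 1024);

        // getReadSizeInSegment: never read past the segment end, and return 0
        // when the cached segment is shorter than the requested offset.
        auto read_size = [](uint64_t off, uint64_t cached, uint64_t want) -> uint64_t
        {
            return cached >= off ? std::min(want, cached - off) : 0;
        };
        assert(read_size(offset_in_segment, segment_size, 256 * 1024) == 84 * 1024);
        return 0;
    }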
@@ -236,20 +330,24 @@ class NexusFS
     const std::unique_ptr device;

     const UInt32 alloc_align_size{};
-    const UInt64 region_size{};
     const UInt64 metadata_size{};
     const UInt32 segment_size{};

     const UInt32 timeout_ms{};

-    NexusFSComponents::NexusFSIndex index;
+    const String file_prefix;
+    const String file_surfix;
+    NexusFSComponents::InodeManager index;
     HybridCache::RegionManager region_manager;
     HybridCache::Allocator allocator;
-    std::shared_ptr reinsertion_policy;
-
     InFlightInserts in_flight_inserts;

-    const bool enable_segment_cache;
-    // NexusFSComponents::SegmentCacheLRU segment_cache;
+    const bool enable_buffer;
+    const bool support_prefetch;
+    NexusFSComponents::BufferManager * buffer_manager;
+    std::vector> reader_workers;
+    mutable std::atomic reader_task_counter{0};
+
+    std::atomic num_segments{0};
 };

 }
diff --git a/src/Storages/NexusFS/NexusFSBuffer.cpp b/src/Storages/NexusFS/NexusFSBuffer.cpp
new file mode 100644
index 00000000000..2c29368d5bb
--- /dev/null
+++ b/src/Storages/NexusFS/NexusFSBuffer.cpp
@@ -0,0 +1,426 @@
+#include 
+
+#include 
+
+#include 
+#include 
+#include 
+#include 
+#include "Common/ProfileEvents.h"
+#include 
+#include 
+#include 
+#include 
+
+
+namespace ProfileEvents
+{
+extern const Event NexusFSBufferHit;
+extern const Event NexusFSBufferMiss;
+extern const Event NexusFSBufferPreload;
+extern const Event NexusFSBufferPreloadRetry;
+extern const Event NexusFSBufferEmptyCoolingQueue;
+extern const Event NexusFSDiskCacheBytesRead;
+}
+
+namespace DB::ErrorCodes
+{
+extern const int CANNOT_OPEN_FILE;
+}
+
+namespace DB::NexusFSComponents
+{
+
+void BufferState::loadAndPin(const std::unique_lock &, std::shared_ptr & handle_)
+{
+    chassert(state == State::COLD);
+    chassert(!handle);
+    state = State::HOT;
+    reader = 1;
+    handle = handle_;
+}
+
+void BufferState::pin(const std::unique_lock &)
+{
+    chassert(handle);
+    state = State::HOT;
+    reader++;
+}
+
+void BufferState::unpin(const std::unique_lock & l)
+{
+    if (state != BufferState::State::HOT || reader == 0 || !handle)
+        throw Exception(
+            ErrorCodes::LOGICAL_ERROR,
+            "try to unpin a {} which has an invalid {}",
+            handle ? 
handle->toStringSimple() : "BlockHandle(nullptr)", + toString(l)); + reader--; +} + +bool BufferState::markCooling(const std::unique_lock &) +{ + if (!handle) + return false; + if (state == State::HOT && reader == 0) + { + state = State::COOLING; + return true; + } + return false; +} + +bool BufferState::tryUnload(const std::unique_lock &) +{ + if (!handle) + return false; + if (state == State::COOLING) + { + if (reader != 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "try to unload a cooling {} whose reader={}", handle->toStringSimple(), reader); + state = State::COLD; + reader = 0; + handle->resetBufferSlot(); + handle.reset(); + return true; + } + return false; +} + +String BufferState::toString(const std::unique_lock &) +{ + return fmt::format( + "BufferState(state={}, reader={}, buffer={}, handle={})", + static_cast(state), + reader, + reinterpret_cast(buffer), + reinterpret_cast(handle.get())); +} + + +BufferState * BlockHandle::getBuffer(const std::unique_lock & lock) +{ + auto slot_id = buffer_slot.load(); + auto * buffer_manager = BufferManager::getInstance(); + chassert(slot_id != INVALID_SLOT_ID); + chassert(buffer_manager); + chassert(&buffer_manager->getMetaMutex(slot_id) == lock.mutex()); + return &buffer_manager->getMetaState(slot_id); +} + +BufferState * BlockHandle::getBufferStateAndLock(std::unique_lock & lock) +{ + auto slot_id = buffer_slot.load(); + while (true) + { + if (slot_id == INVALID_SLOT_ID) + return nullptr; + auto * buffer_manager = BufferManager::getInstance(); + chassert(buffer_manager); + lock = std::unique_lock{buffer_manager->getMetaMutex(slot_id)}; + auto recheck_slot_id = buffer_slot.load(); + if (recheck_slot_id == slot_id) + return &buffer_manager->getMetaState(slot_id); + else + { + lock.unlock(); + lock.release(); + slot_id = recheck_slot_id; + } + } +} + +bool BlockHandle::setBufferSlot(SlotId slot_id) +{ + auto expected = INVALID_SLOT_ID; + return buffer_slot.compare_exchange_strong(expected, slot_id); +} + + +void BlockHandle::unpin() +{ + std::unique_lock lock; + auto * state = getBufferStateAndLock(lock); + chassert(state); + state->unpin(lock); +} + +String BlockHandle::toString() +{ + std::unique_lock lock; + auto * state = getBufferStateAndLock(lock); + if (state) + { + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, state={}, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + state->toString(lock), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); + } + else + { + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, state=null, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); + } +} + +String BlockHandle::toString(const std::unique_lock & lock) +{ + auto * state = getBuffer(lock); + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, state={}, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + state->toString(lock), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); +} + +String BlockHandle::toStringSimple() const +{ + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, buffer_slot={}, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + buffer_slot.load(), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); +} + + +std::unique_ptr BufferManager::buffer_manager = nullptr; + +BufferManager * BufferManager::initInstance( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 
filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_) +{ + if (buffer_manager) + throw Exception(ErrorCodes::LOGICAL_ERROR, "BufferManager already initialized"); + buffer_manager = std::make_unique( + buffer_size_, segment_size_, filemate_gc_interval_, cooling_percentage, freed_percentage, region_manager_, inode_manager_); + return buffer_manager.get(); +} + +BufferManager * BufferManager::getInstance() +{ + return buffer_manager.get(); +} + +BufferManager::BufferManager( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_) + : buffer_size(buffer_size_) + , slot_size(buffer_size_ / segment_size_) + , cooling_size(slot_size * cooling_percentage) + , freed_size(slot_size * freed_percentage) + , segment_size(segment_size_) + , filemate_gc_interval(filemate_gc_interval_) + , region_manager(region_manager_) + , inode_manager(inode_manager_) + , base_data(reinterpret_cast(mmap(nullptr, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0))) + , meta_locks(slot_size) + , free_list(folly::MPMCQueue(slot_size)) +{ + chassert(buffer_size_ % segment_size_ == 0); + chassert(base_data); + chassert(base_data % 4096 == 0); + + meta_states.reserve(slot_size); + for (size_t i = 0; i < slot_size; i++) + meta_states.emplace_back(BufferState(calculateBuffer(i))); + + for (SlotId i = 0; i < slot_size; i++) + free_list.write(i); + + cooling_and_gc_thread = std::thread([this] { coolDownBlocksAndGC(); }); +} + +BufferManager::~BufferManager() +{ + stop_cooling_and_gc.store(true, std::memory_order_relaxed); + cooling_and_gc_thread.join(); + + chassert(base_data); + munmap(reinterpret_cast(base_data), buffer_size); +} + +std::pair BufferManager::pin(std::shared_ptr & handle, UInt64 seq_number) +{ + std::unique_lock lock; + auto * state = handle->getBufferStateAndLock(lock); + if (!state) + { + ProfileEvents::increment(ProfileEvents::NexusFSBufferMiss); + return loadAndPin(handle, seq_number); + } + + ProfileEvents::increment(ProfileEvents::NexusFSBufferHit); + state->pin(lock); + auto buffer = state->getBuffer(); + chassert(buffer != 0); + return {OpResult::SUCCESS, buffer}; +} + +std::pair BufferManager::alloc() +{ + SlotId id; + if (!free_list.read(id)) + { + ProfileEvents::increment(ProfileEvents::NexusFSBufferEmptyCoolingQueue); + std::this_thread::yield(); + return {OpResult::RETRY, 0}; + } + LOG_TRACE(log, "erase slot {} from free_list", id); + return {OpResult::SUCCESS, id}; +} + +void BufferManager::free(SlotId slot_id) +{ + free_list.write(slot_id); + LOG_TRACE(log, "insert slot {} to free_list", slot_id); +} + +std::pair BufferManager::loadAndPin(std::shared_ptr & handle, const UInt64 seq_number) +{ + if (!handle->isRelAddressValid()) + return {OpResult::DEEP_RETRY, 0}; + + auto [op_result, slot_id] = alloc(); + if (op_result != OpResult::SUCCESS) + return {op_result, 0}; + + std::unique_lock l(meta_locks[slot_id]); + if (!handle->setBufferSlot(slot_id)) + { + LOG_TRACE( + log, + "try to set slot {} to BlockHandle({}, slot={}), but failed", + slot_id, + reinterpret_cast(handle.get()), + handle->getBufferSlot()); + free(slot_id); + return {OpResult::DEEP_RETRY, 0}; + } + + RelAddress addr = handle->getRelAddress(); + size_t size = handle->getSize(); + chassert(addr.rid().valid()); + chassert(size > 0); + + auto desc = 
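+ // Open the backing region for reading before copying the segment into the
+ // buffer slot; a Retry status here (region being reclaimed) surfaces as
+ // DEEP_RETRY so the caller re-drives pin(). A minimal caller-side sketch,
+ // assuming only the OpResult contract above (pinWithRetry is illustrative,
+ // not part of this patch):
+ //
+ //   uintptr_t pinWithRetry(BufferManager & bm, std::shared_ptr<BlockHandle> & h, UInt64 seq)
+ //   {
+ //       while (true)
+ //       {
+ //           auto [op, buf] = bm.pin(h, seq);
+ //           if (op == OpResult::SUCCESS)
+ //               return buf;
+ //           std::this_thread::yield(); // RETRY / DEEP_RETRY are transient
+ //       }
+ //   }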
region_manager.openForRead(addr.rid(), seq_number); + if (desc.getStatus() != HybridCache::OpenStatus::Ready) + { + handle->resetBufferSlot(); + free(slot_id); + if (desc.getStatus() == HybridCache::OpenStatus::Retry) + return {OpResult::DEEP_RETRY, 0}; + if (desc.getStatus() == HybridCache::OpenStatus::Error) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "fail to open region for read"); + } + + auto & state = getMetaState(slot_id); + chassert(!state.getHandle()); + uintptr_t buffer = state.getBuffer(); + chassert(buffer); + size_t bytes_read = region_manager.read(desc, addr, size, reinterpret_cast(buffer)); + chassert(size == bytes_read); + LOG_TRACE( + log, + "read {} bytes from disk(rid={}, offset={}, size={}) to buffer {}(slot={})", + bytes_read, + addr.rid().index(), + addr.offset(), + size, + reinterpret_cast(buffer), + slot_id); + ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheBytesRead, size); + + region_manager.touch(addr.rid()); + region_manager.close(std::move(desc)); + + state.loadAndPin(l, handle); + + LOG_TRACE(log, "{} loadAndPin, buffer {}(slot={})", handle->toString(l), reinterpret_cast(buffer), slot_id); + + return {OpResult::SUCCESS, buffer}; +} + +void BufferManager::coolDownBlocksAndGC() +{ + SlotId cooling_itr = 0; + Stopwatch watch; + while (!stop_cooling_and_gc.load(std::memory_order_relaxed)) + { + size_t current_freed = free_list.size(); + if (current_freed >= freed_size) + { + if (watch.elapsedSeconds() >= filemate_gc_interval) + { + watch.restart(); + inode_manager.cleanInvalidFiles(); + } + else + std::this_thread::yield(); + continue; + } + + while (cooling_queue.size() < cooling_size) + { + std::unique_lock l(meta_locks[cooling_itr]); + if (meta_states[cooling_itr].markCooling(l)) + { + auto handle = meta_states[cooling_itr].getHandle(); + chassert(handle); + LOG_TRACE(log, "{} on slot {} turns cooling", handle->toString(l), cooling_itr); + cooling_queue.push(cooling_itr); + } + cooling_itr = (cooling_itr + 1) % slot_size; + } + + while (current_freed < freed_size && !cooling_queue.empty()) + { + SlotId id = cooling_queue.front(); + cooling_queue.pop(); + + std::unique_lock l(meta_locks[id]); + auto handle = meta_states[id].getHandle(); + if (meta_states[id].tryUnload(l)) + { + LOG_TRACE( + log, + "BlockHandle({}) unloaded, {}(slot={}) retrived", + reinterpret_cast(handle.get()), + meta_states[id].toString(l), + id); + free(id); + current_freed++; + } + } + } +} + +} diff --git a/src/Storages/NexusFS/NexusFSBuffer.h b/src/Storages/NexusFS/NexusFSBuffer.h new file mode 100644 index 00000000000..b6c80a2654b --- /dev/null +++ b/src/Storages/NexusFS/NexusFSBuffer.h @@ -0,0 +1,186 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB::NexusFSComponents +{ + +using Mutex = folly::fibers::TimedMutex; +using SlotId = UInt32; +using HybridCache::RelAddress; +class InodeManager; +class BlockHandle; +class BufferManager; + +constexpr UInt32 INVALID_SLOT_ID = UINT32_MAX; + +enum class OpResult : UInt16 +{ + SUCCESS, + RETRY, + DEEP_RETRY, + ERROR +}; + +class BufferState +{ +public: + explicit BufferState(uintptr_t buffer_) : handle(nullptr), buffer(buffer_), reader(0), state(State::COLD) + { + chassert(buffer != 0); + } + + void loadAndPin(const std::unique_lock &, std::shared_ptr & handle); + + void pin(const std::unique_lock &l); + void unpin(const std::unique_lock &l); + + bool markCooling(const std::unique_lock &); + + bool tryUnload(const 
std::unique_lock &); + + std::shared_ptr getHandle() const { return handle; } + uintptr_t getBuffer() const { return buffer; } + UInt16 getReader() const { return reader; } + + String toString(const std::unique_lock &); + +private: + enum class State : UInt8 + { + HOT, + COOLING, + COLD + }; + + std::shared_ptr handle{nullptr}; + const uintptr_t buffer{0}; + UInt16 reader{0}; + State state{State::COLD}; +}; + + +class BlockHandle +{ +public: + explicit BlockHandle(RelAddress addr_, UInt32 size_) : addr(addr_), size(size_) { } + BlockHandle(const BlockHandle &) = delete; + BlockHandle & operator=(const BlockHandle &) = delete; + + UInt32 getSize() const { return size; } + RelAddress getRelAddress() const { return addr; } + bool isRelAddressValid() const { return addr.load().rid().valid(); } + void invalidRelAddress() { addr.store(RelAddress()); } + SlotId getBufferSlot() const { return buffer_slot.load(); } + void resetBufferSlot() { buffer_slot.store(INVALID_SLOT_ID); } + + BufferState * getBuffer(const std::unique_lock & lock); + BufferState * getBufferStateAndLock(std::unique_lock & lock); + + bool setBufferSlot(SlotId slot_id); + + void unpin(); + + String toString(); + String toString(const std::unique_lock &); + String toStringSimple() const; + +private: + friend class BufferManager; + + std::atomic addr{RelAddress()}; + const UInt32 size{0}; + std::atomic buffer_slot{INVALID_SLOT_ID}; +}; + + +class BufferManager +{ +public: + static std::unique_ptr buffer_manager; + static BufferManager * initInstance( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_ + ); + static BufferManager * getInstance(); + + explicit BufferManager( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_); + ~BufferManager(); + BufferManager(const BufferManager &) = delete; + BufferManager & operator=(const BufferManager &) = delete; + + std::pair pin(std::shared_ptr & handle, UInt64 seq_number); + +private: + friend class BlockHandle; + + Mutex & getMetaMutex(SlotId id) + { + chassert(id < slot_size); + return meta_locks[id]; + } + BufferState & getMetaState(SlotId id) + { + chassert(id < slot_size); + return meta_states[id]; + } + + uintptr_t calculateBuffer(SlotId slot_id) const { return base_data + static_cast(slot_id) * segment_size; } + + std::pair alloc(); + void free(SlotId slot_id); + + std::pair loadAndPin(std::shared_ptr & handle, UInt64 seq_number); + + void coolDownBlocksAndGC(); + + LoggerPtr log = getLogger("NexusFSBufferManager"); + + const size_t buffer_size{}; + const size_t slot_size{}; + const size_t cooling_size{}; + const size_t freed_size{}; + const UInt32 segment_size{}; + const UInt32 filemate_gc_interval{}; + + HybridCache::RegionManager & region_manager; + InodeManager & inode_manager; + + const uintptr_t base_data; + + // TODO: optimize with lock manager + std::vector meta_locks; + std::vector meta_states; + + std::queue cooling_queue; + folly::MPMCQueue free_list; + + std::atomic stop_cooling_and_gc{false}; + std::thread cooling_and_gc_thread; +}; + +} diff --git a/src/Storages/NexusFS/NexusFSBufferWithHandle.h b/src/Storages/NexusFS/NexusFSBufferWithHandle.h new file mode 100644 index 00000000000..5c84a28a55c --- /dev/null +++ 
b/src/Storages/NexusFS/NexusFSBufferWithHandle.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class NexusFSBufferWithHandle +{ +public: + NexusFSBufferWithHandle() = default; + NexusFSBufferWithHandle(NexusFSBufferWithHandle && other) noexcept + : handle(std::move(other.handle)), buffer(std::move(other.buffer)), insert_cxt(std::move(other.insert_cxt)) + { + } + NexusFSBufferWithHandle & operator=(NexusFSBufferWithHandle && other) noexcept + { + if (this == &other) + return *this; + + reset(); + swap(handle, other.handle); + swap(buffer, other.buffer); + swap(insert_cxt, other.insert_cxt); + return *this; + } + ~NexusFSBufferWithHandle() { reset(); } + + void reset() + { + if (handle) + { + handle->unpin(); + handle.reset(); + } + if (buffer) + buffer.reset(); + if (insert_cxt) + insert_cxt.reset(); + } + + size_t getSize() const { return buffer ? buffer->available() : 0; } + BufferBase::Position getData() { return buffer ? buffer->position() : nullptr; } + +private: + friend class NexusFS; + + std::shared_ptr handle{nullptr}; + std::unique_ptr> buffer{nullptr}; + std::shared_ptr insert_cxt{nullptr}; +}; +} diff --git a/src/Storages/NexusFS/NexusFSIndex.cpp b/src/Storages/NexusFS/NexusFSIndex.cpp deleted file mode 100644 index bf44a6cf1c3..00000000000 --- a/src/Storages/NexusFS/NexusFSIndex.cpp +++ /dev/null @@ -1,208 +0,0 @@ -#include - -#include - -#include - -#include -#include -#include "Storages/DiskCache/Types.h" - -namespace DB::ErrorCodes -{ -extern const int INVALID_CONFIG_PARAMETER; -} - -namespace DB::NexusFSComponents -{ -namespace -{ - UInt8 safeInc(UInt8 val) - { - if (val < std::numeric_limits::max()) - return val + 1; - return val; - } -} - -void NexusFSIndex::setHits(UInt64 key, UInt8 current_hits, UInt8 total_hits) -{ - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - it.value().current_hits = current_hits; - it.value().total_hits = total_hits; - } -} - -NexusFSIndex::LookupResult NexusFSIndex::lookup(UInt64 key) -{ - LookupResult result; - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - result.found = true; - result.record = it->second; - it.value().total_hits = safeInc(result.record.total_hits); - it.value().current_hits = safeInc(result.record.current_hits); - } - return result; -} - -NexusFSIndex::LookupResult NexusFSIndex::peek(UInt64 key) const -{ - LookupResult result; - const auto & map = getMap(key); - auto lock = std::shared_lock{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - result.found = true; - result.record = it->second; - } - return result; -} - -NexusFSIndex::LookupResult NexusFSIndex::insert(UInt64 key, RelAddress address, UInt32 size) -{ - LookupResult result; - auto & map = getMap(key); - // auto handle = std::make_shared(); - // handle->loadedToDisk(address); - - auto guard = std::lock_guard{getMutex(key)}; - auto ret = map.try_emplace(subkey(key), address, size); - chassert(ret.second); - result.found = true; - result.record = ret.first->second; - - return result; -} - -// bool NexusFSIndex::replaceIfMatch(UInt64 key, RelAddress new_address, RelAddress old_address) -// { -// auto & map = getMap(key); -// auto guard = std::lock_guard{getMutex(key)}; - -// auto it = map.find(subkey(key)); -// if (it != map.end() && it->second.address == old_address) -// { -// it.value().address = new_address; -// 
it.value().current_hits = 0; -// return true; -// } -// return false; -// } - -void NexusFSIndex::trackRemove(UInt8 total_hits) -{ - if (total_hits == 0) - unaccessed_items++; -} - -NexusFSIndex::LookupResult NexusFSIndex::remove(UInt64 key) -{ - LookupResult result; - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - result.found = true; - result.record = it->second; - - trackRemove(it->second.total_hits); - map.erase(it); - } - return result; -} - -bool NexusFSIndex::removeIfMatch(UInt64 key, RelAddress address) -{ - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end() && it->second.address == address) - { - trackRemove(it->second.total_hits); - map.erase(it); - return true; - } - return false; -} - -void NexusFSIndex::reset() -{ - for (UInt32 i = 0; i < kNumBuckets; i++) - { - auto guard = std::lock_guard{getMutexOfBucket(i)}; - buckets[i].clear(); - } - unaccessed_items = 0; -} - -size_t NexusFSIndex::compuiteSize() const -{ - size_t size = 0; - for (UInt32 i = 0; i < kNumBuckets; i++) - { - auto guard = std::lock_guard{getMutexOfBucket(i)}; - size += buckets[i].size(); - } - return size; -} - -void NexusFSIndex::persist(google::protobuf::io::CodedOutputStream * stream) const -{ - Protos::NexusFSIndexBucket bucket; - for (UInt32 i = 0; i < kNumBuckets; i++) - { - bucket.set_bucket_id(i); - for (const auto & [key, v] : buckets[i]) - { - auto * entry = bucket.add_entries(); - entry->set_key(key); - entry->set_address_rid(v.address.rid().index()); - entry->set_address_offset(v.address.offset()); - entry->set_size(v.size); - entry->set_total_hits(v.total_hits); - entry->set_current_hits(v.current_hits); - } - - google::protobuf::util::SerializeDelimitedToCodedStream(bucket, stream); - bucket.clear_entries(); - } -} - -void NexusFSIndex::recover(google::protobuf::io::CodedInputStream * stream) -{ - for (UInt32 i = 0; i < kNumBuckets; i++) - { - Protos::NexusFSIndexBucket bucket; - google::protobuf::util::ParseDelimitedFromCodedStream(&bucket, stream, nullptr); - UInt32 id = bucket.bucket_id(); - if (id >= kNumBuckets) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Invalid bucket id. 
Max buckets: {}, bucket id: {}", kNumBuckets, id); - - for (const auto & entry : bucket.entries()) - { - buckets[id].try_emplace( - entry.key(), - RelAddress(HybridCache::RegionId(entry.address_rid()), entry.address_offset()), - static_cast(entry.size()), - static_cast(entry.total_hits()), - static_cast(entry.current_hits())); - } - } -} - -} diff --git a/src/Storages/NexusFS/NexusFSIndex.h b/src/Storages/NexusFS/NexusFSIndex.h deleted file mode 100644 index 7da3e197d1f..00000000000 --- a/src/Storages/NexusFS/NexusFSIndex.h +++ /dev/null @@ -1,180 +0,0 @@ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include "IO/BufferWithOwnMemory.h" -#include "QueryPlan/Void.h" -#include "Storages/DiskCache/Buffer.h" -#include "Storages/DiskCache/Types.h" - - - -namespace ProfileEvents -{ - extern const Event NexusFSMemoryBufferHit; - extern const Event NexusFSMemoryBufferMiss; - extern const Event NexusFSMemoryBufferEvict; -} - -namespace DB::NexusFSComponents -{ -using HybridCache::RelAddress; - -class NexusFSIndex -{ -public: - using SharedMutex = folly::fibers::TimedRWMutexWritePriority; - - NexusFSIndex() = default; - NexusFSIndex(const NexusFSIndex &) = delete; - NexusFSIndex & operator=(const NexusFSIndex &) = delete; - - void persist(google::protobuf::io::CodedOutputStream * stream) const; - void recover(google::protobuf::io::CodedInputStream * stream); - - struct PACKED_LINLINE ItemRecord - { - // address in device - // std::shared_ptr handle; - RelAddress address{RelAddress()}; - // item size - UInt32 size{0}; - // total hits during this item's entire lifetime in cache - UInt8 total_hits{0}; - // hits during the current window for this item - UInt8 current_hits{0}; - - // explicit ItemRecord(std::shared_ptr handle_ = nullptr, UInt32 size_ = 0, UInt8 total_hits_ = 0, UInt8 current_hits_ = 0) - // : handle(handle_), size(size_), total_hits(total_hits_), current_hits(current_hits_) - // { - // } - explicit ItemRecord(RelAddress address_ = RelAddress(), UInt32 size_ = 0, UInt8 total_hits_ = 0, UInt8 current_hits_ = 0) - : address(address_), size(size_), total_hits(total_hits_), current_hits(current_hits_) - { - } - }; - // static_assert(14 == sizeof(ItemRecord), "ItemRecord size is 14 bytes"); - - struct LookupResult - { - friend class NexusFSIndex; - - bool isFound() const { return found; } - - ItemRecord getRecord() const - { - chassert(found); - return record; - } - - // std::shared_ptr getHandler() const - // { - // chassert(found); - // return record.handle; - // } - - RelAddress getAddress() const - { - chassert(found); - return record.address; - } - - UInt32 getSize() const - { - chassert(found); - return record.size; - } - - UInt8 getTotalHits() const - { - chassert(found); - return record.total_hits; - } - - UInt8 getCurrentHits() const - { - chassert(found); - return record.current_hits; - } - - private: - ItemRecord record; - bool found{false}; - }; - - // Gets value and update tracking counters - LookupResult lookup(UInt64 key); - - // Gets value without updating tracking counters - LookupResult peek(UInt64 key) const; - - // Overwrites existing key if exists with new address adn size. If the entry was successfully overwritten, - // LookupResult returns . - LookupResult insert(UInt64 key, RelAddress address, UInt32 size); - - // // Replaces old address with new address if there exists the key with the identical old address. 
- // bool replaceIfMatch(UInt64 key, RelAddress new_address, RelAddress old_address); - - // If the entry was successfully removed, LookupResult returns . - LookupResult remove(UInt64 key); - - // Removes only if both key and address match. - bool removeIfMatch(UInt64 key, RelAddress address); - - // Update hits information of a key. - void setHits(UInt64 key, UInt8 current_hits, UInt8 total_hits); - - // Resets all the buckets to the initial state. - void reset(); - - // Walks buckets and computes total index entry count - size_t compuiteSize() const; - -private: - static constexpr UInt32 kNumBuckets{64 * 1024}; - static constexpr UInt32 kNumMutexes{1024}; - - using Map = tsl::sparse_map; - - static UInt32 bucket(UInt64 hash) { return (hash >> 32) & (kNumBuckets - 1); } - - static UInt32 subkey(UInt64 hash) { return hash & 0xffffffffu; } - - SharedMutex & getMutexOfBucket(UInt32 bucket) const - { - chassert(isPowerOf2(kNumMutexes)); - return mutex[bucket & (kNumMutexes - 1)]; - } - - SharedMutex & getMutex(UInt64 hash) const - { - auto b = bucket(hash); - return getMutexOfBucket(b); - } - - Map & getMap(UInt64 hash) const - { - auto b = bucket(hash); - return buckets[b]; - } - - void trackRemove(UInt8 total_hits); - - mutable Poco::AtomicCounter unaccessed_items; - - std::unique_ptr mutex{new SharedMutex[kNumMutexes]}; - std::unique_ptr buckets{new Map[kNumBuckets]}; -}; - -} diff --git a/src/Storages/NexusFS/NexusFSInodeManager.cpp b/src/Storages/NexusFS/NexusFSInodeManager.cpp new file mode 100644 index 00000000000..6d8abd1ee12 --- /dev/null +++ b/src/Storages/NexusFS/NexusFSInodeManager.cpp @@ -0,0 +1,392 @@ +#include + +#include +#include + +#include + +#include +#include "common/defines.h" +#include "common/logger_useful.h" +#include "common/types.h" +#include +#include +#include "IO/WriteHelpers.h" + +namespace ProfileEvents +{ +extern const Event NexusFSInodeManagerLookupMicroseconds; +extern const Event NexusFSInodeManagerInsertMicroseconds; +} + +namespace DB::ErrorCodes +{ +extern const int INVALID_CONFIG_PARAMETER; +extern const int CANNOT_OPEN_FILE; +} + +namespace DB::NexusFSComponents +{ + +std::shared_ptr FileMeta::getHandle(UInt64 segment_id) +{ + std::lock_guard l(mutex); + if (segment_id >= segments.size()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta::getHandle for segment_id {} out of bound", segment_id); + if (segments[segment_id] && !segments[segment_id]->isRelAddressValid()) + segments[segment_id].reset(); + return segments[segment_id]; +} + +void FileMeta::setHandle(UInt64 segment_id, std::shared_ptr & handle) +{ + std::lock_guard l(mutex); + if (segment_id >= segments.size()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta::setHandle for segment_id {} out of bound", segment_id); + segments[segment_id] = handle; +} + +void FileMeta::toProto(Protos::NexusFSFileMeta * proto) +{ + std::lock_guard l(mutex); + proto->set_file_size(file_size); + for (size_t i = 0; i < segments.size(); i++) + { + const auto & handle = segments[i]; + if (handle) + { + Protos::NexusFSFileSegment * proto_handle = proto->add_segments(); + proto_handle->set_segment_id(i); + proto_handle->set_address_rid(handle->getRelAddress().rid().index()); + proto_handle->set_address_offset(handle->getRelAddress().offset()); + proto_handle->set_size(handle->getSize()); + } + } +} + +bool FileMeta::canBeRemoved() +{ + bool has_valid_handle = false; + std::lock_guard l(mutex); + for (auto & segment : segments) + { + if (segment) + { + if (segment->isRelAddressValid()) + has_valid_handle 
= true; + else + segment.reset(); + } + } + return has_valid_handle; +} + +std::pair FileMeta::getCachedSizeAndSegments() +{ + std::lock_guard l(mutex); + UInt64 cached_segments = 0; + UInt64 cached_size = 0; + for (auto & segment : segments) + { + if (segment) + { + if (segment->isRelAddressValid()) + { + cached_segments++; + cached_size += segment->getSize(); + } + else + segment.reset(); + } + } + return {cached_size, cached_segments}; +} + + +std::shared_ptr Inode::getHandle(String & file, UInt64 segment_id) +{ + auto it = files.find(file); + if (it == files.end()) + return nullptr; + + const auto & meta = it->second; + if (!meta) + return nullptr; + + return meta->getHandle(segment_id); +} + +void Inode::setHandle( + const String & file, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size, + std::atomic & num_file_metas) +{ + auto it = files.find(file); + if (it == files.end()) + { + auto [file_size, segment_size] = get_file_and_segment_size(); + auto meta = std::make_shared(file_size, segment_size); + it = files.try_emplace(file, meta).first; + num_file_metas++; + } + + const auto & meta = it->second; + if (!meta) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta for file {} not found", file); + + meta->setHandle(segment_id, handle); +} + +void Inode::setHandle(const String & file, std::shared_ptr & file_meta) +{ + if (!files.try_emplace(file, file_meta).second) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta for file {} already exists", file); +} + +void Inode::cleanInvalidFiles(std::atomic & num_file_metas) +{ + for (auto [file_name, file_meta] : files) + { + if (file_meta->canBeRemoved()) + { + files.erase(file_name); + num_file_metas--; + } + } +} + +void Inode::toProto(Protos::NexusFSInode * node) +{ + node->set_node_id(id); + for (const auto & [key, meta] : files) + { + Protos::NexusFSFileMeta * file = node->add_files(); + file->set_file_name(key); + meta->toProto(file); + } +} + +void Inode::getFileCachedStates(std::vector & result) +{ + for (const auto & [file_name, file_meta] : files) + { + auto [total_size, total_segment] = file_meta->getTotalSizeAndSegments(); + auto [cached_size, cached_segment] = file_meta->getCachedSizeAndSegments(); + result.emplace_back(FileCachedState{ + .file_path = file_name, + .total_segments = total_segment, + .cached_segments = cached_segment, + .total_size = total_size, + .cached_size = cached_size}); + } +} + + +std::shared_ptr InodeManager::lookup(const String & path, UInt64 segment_id) const +{ + // TODO: increase hits + return peek(path, segment_id); +} + +std::shared_ptr InodeManager::peek(const String & path, UInt64 segment_id) const +{ + ProfileEventTimeIncrement watch(ProfileEvents::NexusFSInodeManagerLookupMicroseconds); + + std::vector dirs; + resolvePath(path, dirs); + chassert(!dirs.empty()); + + String file = dirs.back(); + dirs.pop_back(); + + UInt64 pid = 0; + auto it = inodes.end(); + for (auto & dir : dirs) + { + String pid_dir = toString(pid) + "/" + dir; + it = inodes.find(pid_dir); + if (it == inodes.end()) + { + return nullptr; + } + else + { + pid = it->second->getId(); + } + } + + const auto & inode = dirs.empty() ? 
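+ // Directory inodes are keyed by "<parent_inode_id>/<dir_name>" with the root
+ // inode fixed at id 0: for example, "/prefix/a/b/f/data" resolves to dirs
+ // {a, b} plus file "f", probing keys "0/a" and then "<id(a)>/b"; a path with
+ // no intermediate directory falls through to root_inode below.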
root_inode : it->second; + return inode->getHandle(file, segment_id); +} + +void InodeManager::insert( + const String & path, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size) +{ + ProfileEventTimeIncrement watch(ProfileEvents::NexusFSInodeManagerInsertMicroseconds); + + std::vector dirs; + resolvePath(path, dirs); + + String file = dirs.back(); + dirs.pop_back(); + + UInt64 pid = 0; + auto it = inodes.end(); + for (const auto & dir : dirs) + { + String pid_dir = toString(pid) + "/" + dir; + it = inodes.find(pid_dir); + if (it == inodes.end()) + { + it = inodes.try_emplace(pid_dir, std::make_shared(inode_id.fetch_add(1))).first; + num_inodes++; + } + pid = it->second->getId(); + } + + const auto & inode = dirs.empty() ? root_inode : it->second; + inode->setHandle(file, segment_id, handle, get_file_and_segment_size, num_file_metas); +} + +void InodeManager::reset() +{ + inodes.clear(); + num_inodes = 1; +} + +void InodeManager::persist(google::protobuf::io::CodedOutputStream * stream) const +{ + Protos::NexusFSInodeManager manager; + manager.set_prefix(prefix); + manager.set_surfix(surfix); + auto * root_inode_proto = manager.mutable_root_inode(); + root_inode_proto->set_node_key(""); + root_inode->toProto(root_inode_proto); + for (const auto & [key, val] : inodes) + { + auto * node = manager.add_inodes(); + node->set_node_key(key); + val->toProto(node); + } + google::protobuf::util::SerializeDelimitedToCodedStream(manager, stream); +} + +void InodeManager::recover( + google::protobuf::io::CodedInputStream * stream, HybridCache::RegionManager & region_manager, std::atomic & num_segments) +{ + Protos::NexusFSInodeManager manager; + google::protobuf::util::ParseDelimitedFromCodedStream(&manager, stream, nullptr); + + if (manager.prefix() != prefix || manager.surfix() != surfix) + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "Invalid prefix or surfix. 
Expected prefix: {}, surfix: {}, actual prefix: {}, surfix: {}", + prefix, + surfix, + manager.prefix(), + manager.surfix()); + + auto recover_files_in_inode = [&](std::shared_ptr & node, const Protos::NexusFSInode & proto_node) + { + for (const auto & proto_file : proto_node.files()) + { + auto file = std::make_shared(proto_file.file_size(), segment_size); + num_file_metas++; + for (const auto & proto_seg : proto_file.segments()) + { + auto rid = HybridCache::RegionId(proto_seg.address_rid()); + auto addr = RelAddress(rid, proto_seg.address_offset()); + auto handle = std::make_shared(addr, proto_seg.size()); + file->setHandle(proto_seg.segment_id(), handle); + region_manager.getRegion(rid).addHandle(handle); + num_segments++; + } + node->setHandle(proto_file.file_name(), file); + } + }; + + recover_files_in_inode(root_inode, manager.root_inode()); + for (const auto & proto_node : manager.inodes()) + { + auto inode = std::make_shared(proto_node.node_id()); + num_inodes++; + recover_files_in_inode(inode, proto_node); + inodes.emplace(proto_node.node_key(), inode); + } +} + + +String InodeManager::extractValidPath(const String & path) const +{ + if (path.size() <= prefix.size() + surfix.size()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} invalid, its length is smaller than prefix + surfix", path); + if (prefix != path.substr(0, prefix.size())) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} has invalid prefix, required prefix should be {}", path, prefix); + if (surfix != path.substr(path.size() - surfix.size(), surfix.size())) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} has invalid surfix, required surfix should be {}", path, surfix); + + String valid_path = path.substr(prefix.size(), path.size() - prefix.size() - surfix.size()); + if (valid_path.empty()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} is invalid, it consists of only prefix and suffix. ", path); + + return valid_path; +} + + +void InodeManager::resolvePath(const String & path, std::vector & ressolved_dirs) const +{ + String valid_path = extractValidPath(path); + + String dir; + for (auto ch : valid_path) + { + if (ch == '/') + { + if (!dir.empty()) + { + ressolved_dirs.push_back(dir); + dir.clear(); + } + } + else + { + dir.push_back(ch); + } + } + if (!dir.empty()) + ressolved_dirs.push_back(dir); + + if (ressolved_dirs.empty()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} is invalid, it consists of only prefix and suffix. 
", path); + + { + String str = ressolved_dirs[0]; + for (size_t i = 1; i < ressolved_dirs.size(); i++) + { + str += "," + ressolved_dirs[i]; + } + LOG_TRACE(log, "resolvePath get: {}", str); + } +} + +void InodeManager::cleanInvalidFiles() +{ + for (auto [_, node] : inodes) + node->cleanInvalidFiles(num_file_metas); +} + +std::vector InodeManager::getFileCachedStates() +{ + std::vector ret; + root_inode->getFileCachedStates(ret); + for (auto [dir_name, node] : inodes) + node->getFileCachedStates(ret); + return ret; +} +} diff --git a/src/Storages/NexusFS/NexusFSInodeManager.h b/src/Storages/NexusFS/NexusFSInodeManager.h new file mode 100644 index 00000000000..23032a9cad8 --- /dev/null +++ b/src/Storages/NexusFS/NexusFSInodeManager.h @@ -0,0 +1,151 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include "IO/BufferWithOwnMemory.h" +#include "Storages/DiskCache/Buffer.h" +#include "Storages/DiskCache/Types.h" +#include "Storages/NexusFS/NexusFSBuffer.h" + +#include + + +namespace DB::NexusFSComponents +{ + +struct FileCachedState +{ + String file_path; + UInt64 total_segments; + UInt64 cached_segments; + UInt64 total_size; + UInt64 cached_size; +}; + +class FileMeta +{ +public: + explicit FileMeta(size_t file_size_, UInt32 segment_size) : file_size(file_size_) + { + chassert(segment_size > 0); + size_t segments_count = (file_size + segment_size - 1) / segment_size; + segments.resize(segments_count); + } + FileMeta(const FileMeta &) = delete; + FileMeta & operator=(const FileMeta &) = delete; + + size_t getFileSize() const { return file_size; } + std::pair getTotalSizeAndSegments() const { return {file_size, segments.size()}; } + std::pair getCachedSizeAndSegments(); + + std::shared_ptr getHandle(UInt64 segment_id); + void setHandle(UInt64 segment_id, std::shared_ptr & handle); + + void toProto(Protos::NexusFSFileMeta * proto); + + bool canBeRemoved(); + +private: + folly::fibers::TimedMutex mutex; + const size_t file_size; + std::vector> segments; +}; + +class Inode +{ +public: + explicit Inode(UInt64 id_) : id(id_) { } + Inode(const Inode &) = delete; + Inode & operator=(const Inode &) = delete; + + UInt64 getId() const { return id; } + + std::shared_ptr getHandle(String & file, UInt64 segment_id); + void setHandle( + const String & file, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size, + std::atomic & num_file_metas); + void setHandle(const String & file, std::shared_ptr & file_meta); + + void toProto(Protos::NexusFSInode * node); + + void cleanInvalidFiles(std::atomic & num_file_metas); + + void getFileCachedStates(std::vector & result); + +private: + UInt64 id; + folly::ConcurrentHashMap> files; +}; + +class InodeManager +{ +public: + explicit InodeManager(const String & prefix_, const String & surfix_, const UInt32 segment_size_) + : prefix(prefix_), surfix(surfix_), segment_size(segment_size_), root_inode(std::make_shared(0)) + { + num_inodes++; + } + InodeManager(const InodeManager &) = delete; + InodeManager & operator=(const InodeManager &) = delete; + + + void persist(google::protobuf::io::CodedOutputStream * stream) const; + void recover( + google::protobuf::io::CodedInputStream * stream, HybridCache::RegionManager & region_manager, std::atomic & num_segments); + + // Gets value and update tracking counters + std::shared_ptr lookup(const String & path, UInt64 segment_id) const; + + // Gets value without updating tracking counters + 
std::shared_ptr peek(const String & path, UInt64 segment_id) const; + + // Overwrites existing key if exists with new address adn size. If the entry was successfully overwritten, + // LookupResult returns . + void insert( + const String & path, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size); + + // Resets all the buckets to the initial state. + void reset(); + + void cleanInvalidFiles(); + + UInt64 getNumInodes() const { return num_inodes.load(); } + UInt64 getNumFileMetas() const { return num_file_metas.load(); } + + std::vector getFileCachedStates(); + +private: + String extractValidPath(const String & path) const; + void resolvePath(const String & path, std::vector & ressolved_dirs) const; + + LoggerPtr log = getLogger("NexusFSInodeManager"); + + const String prefix; + const String surfix; + const UInt32 segment_size; + std::atomic inode_id{1}; + std::shared_ptr root_inode; + folly::ConcurrentHashMap> inodes; + + std::atomic num_inodes{0}; + std::atomic num_file_metas{0}; +}; + +} diff --git a/src/Storages/NexusFS/tests/gtest_inode_manager_test.cpp b/src/Storages/NexusFS/tests/gtest_inode_manager_test.cpp new file mode 100644 index 00000000000..f871be3c306 --- /dev/null +++ b/src/Storages/NexusFS/tests/gtest_inode_manager_test.cpp @@ -0,0 +1,177 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB::NexusFSComponents +{ + +using namespace HybridCache; + +TEST(NexusFSInodeManager, GetAndSet) +{ + auto get_file_and_segment_size = []() { return std::make_pair(5, 1); }; + InodeManager index("/prefix/", "/data", 1); + auto h1 = std::make_shared(RelAddress(RegionId(1), 2), 3); + index.insert("/prefix//AA/BB/CC/data", 0, h1, get_file_and_segment_size); + auto h2 = index.lookup("/prefix/AA/BB//CC//data", 0); + EXPECT_EQ(h1.get(), h2.get()); + EXPECT_TRUE(index.lookup("/prefix/AA/BB/CC/data", 0)); + EXPECT_FALSE(index.lookup("/prefix/AA/BB/CC/data", 1)); + EXPECT_FALSE(index.lookup("/prefix/AA/BB/CC/data", 1)); + EXPECT_FALSE(index.lookup("/prefix/AA/BB/CCX/data", 0)); + EXPECT_FALSE(index.lookup("/prefix/AA/AA/BB/CC/data", 0)); + + EXPECT_FALSE(index.lookup("/prefix/AA/BB/BB/data", 1)); + auto h3 = std::make_shared(RelAddress(RegionId(2), 3), 4); + index.insert("/prefix/AA/BB/BB/data", 1, h3, get_file_and_segment_size); + EXPECT_TRUE(index.lookup("/prefix/AA/BB/BB/data", 1)); + + EXPECT_FALSE(index.lookup("/prefix/0/data", 1)); + auto h4 = std::make_shared(RelAddress(RegionId(5), 3), 4); + index.insert("/prefix/0/data", 1, h4, get_file_and_segment_size); + EXPECT_TRUE(index.lookup("/prefix/0/data", 1)); +} + +TEST(NexusFSInodeManager, InvalidPath) +{ + InodeManager index("/prefix/", "/data", 128); + EXPECT_THROW({ index.lookup("", 0); }, Exception); + EXPECT_THROW({ index.lookup("/prefix1/AA/BB/CC/data", 0); }, Exception); + EXPECT_THROW({ index.lookup("/prefix/AA/BB/CC/data2", 0); }, Exception); + EXPECT_THROW({ index.lookup("/prefix/data", 0); }, Exception); + EXPECT_THROW({ index.lookup("/prefix//data", 0); }, Exception); + EXPECT_THROW({ index.lookup("/prefix///data", 0); }, Exception); +} + +TEST(NexusFSInodeManager, InvalidSegmentId) +{ + auto get_file_and_segment_size = []() { return std::make_pair(5, 1); }; + InodeManager index("/prefix/", "/data", 1); + EXPECT_FALSE(index.lookup("/prefix/AA/data", 1)); + auto h1 = std::make_shared(RelAddress(RegionId(1), 2), 3); + index.insert("/prefix/AA/data", 1, h1, get_file_and_segment_size); + 
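+ // get_file_and_segment_size reports a 5-byte file with 1-byte segments, so the
+ // FileMeta created above holds exactly 5 segment slots (ids 0..4); the lookups
+ // and inserts with ids >= 5 below are out of bounds and must throw.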
EXPECT_TRUE(index.lookup("/prefix/AA/data", 1)); + EXPECT_THROW({ index.lookup("/prefix/AA/data", 5); }, Exception); + EXPECT_THROW({ index.lookup("/prefix/AA/data", 6); }, Exception); + EXPECT_THROW({ index.lookup("/prefix/AA/data", 999); }, Exception); + EXPECT_THROW({ index.insert("/prefix/AA/data", 5, h1, get_file_and_segment_size); }, Exception); + EXPECT_THROW({ index.insert("/prefix/AA/data", 6, h1, get_file_and_segment_size); }, Exception); + EXPECT_THROW({ index.insert("/prefix/AA/data", 999, h1, get_file_and_segment_size); }, Exception); +} + +TEST(NexusFSInodeManager, ThreadSafe) +{ + auto get_file_and_segment_size = []() { return std::make_pair(20, 1); }; + InodeManager index("/prefix/", "/data", 128); + const String file = "/prefix/AA/BB/CC/DD/EE/data"; + auto handle = std::make_shared(RelAddress(RegionId(1), 2), 3); + index.insert(file, 10, handle, get_file_and_segment_size); + + auto lookup = [&]() { index.lookup(file, 10); }; + + std::vector threads; + threads.reserve(200); + for (int i = 0; i < 200; i++) + { + threads.emplace_back(lookup); + } + + for (auto & t : threads) + { + t.join(); + } + + // TODO: hits + // EXPECT_EQ(200, index.peek(key).getTotalHits()); + // EXPECT_EQ(200, index.peek(key).getCurrentHits()); +} + +TEST(NexusFSInodeManager, Recovery) +{ + auto get_file_and_segment_size = []() { return std::make_pair(10, 1); }; + InodeManager index("/prefix/", "/data", 1); + std::vector> log; + for (UInt64 i = 0; i < 16; i++) + { + for (UInt64 j = 0; j < 10; j++) + { + String file = "/prefix/123/" + toString(i) + "/data"; + auto handle = std::make_shared(RelAddress(RegionId(i), j), 1); + index.insert(file, j, handle, get_file_and_segment_size); + log.emplace_back(file, j); + } + } + for (UInt64 i = 16; i < 20; i++) + { + for (UInt64 j = 0; j < 10; j++) + { + String file = "/prefix/" + toString(i) + "/data"; + auto handle = std::make_shared(RelAddress(RegionId(i), j), 1); + index.insert(file, j, handle, get_file_and_segment_size); + log.emplace_back(file, j); + } + } + { + auto files = index.getFileCachedStates(); + EXPECT_EQ(20, files.size()); + for (auto & file : files) + { + EXPECT_EQ(10, file.cached_size); + EXPECT_EQ(10, file.total_size); + EXPECT_EQ(10, file.cached_segments); + EXPECT_EQ(10, file.total_segments); + } + } + + Buffer metadata(INT_MAX); + + { + google::protobuf::io::ArrayOutputStream raw_stream(metadata.data(), INT_MAX); + google::protobuf::io::CodedOutputStream ostream(&raw_stream); + + index.persist(&ostream); + } + + auto device = createMemoryDevice(4096 * 20, 4096); + auto policy = std::make_unique(); + RegionManager region_manager(20, 4096, 0, *device, 1, 1, {}, {}, std::move(policy), 2, 4, 10); + std::atomic num_segments = 0; + InodeManager new_index("/prefix/", "/data", 1); + google::protobuf::io::ArrayInputStream raw_stream(metadata.data(), INT_MAX); + google::protobuf::io::CodedInputStream istream(&raw_stream); + new_index.recover(&istream, region_manager, num_segments); + for (auto & entry : log) + { + EXPECT_TRUE(new_index.lookup(entry.first, entry.second)); + } + for (UInt64 i = 20; i < 24; i++) + { + for (UInt64 j = 0; j < 10; j++) + { + String file = "/prefix/123/" + toString(i) + "/data"; + EXPECT_FALSE(new_index.lookup(file, j)); + } + } + { + auto files = index.getFileCachedStates(); + EXPECT_EQ(20, files.size()); + for (auto & file : files) + { + EXPECT_EQ(10, file.cached_size); + EXPECT_EQ(10, file.total_size); + EXPECT_EQ(10, file.cached_segments); + EXPECT_EQ(10, file.total_segments); + } + } +} + +} diff --git 
a/src/Storages/RemoteFile/CnchFileSettings.h b/src/Storages/RemoteFile/CnchFileSettings.h index 92a79f3ae0c..da7b4aa59e8 100644 --- a/src/Storages/RemoteFile/CnchFileSettings.h +++ b/src/Storages/RemoteFile/CnchFileSettings.h @@ -18,7 +18,8 @@ struct Settings; M(String, cnch_vw_task, "vw_task", "", 0) \ M(String, resources_assign_type, "server_push", "", 0) \ M(Bool, simple_hash_resources, true, "", 0) \ - M(Bool, cnch_temporary_table, false, "", 0) + M(Bool, cnch_temporary_table, false, "", 0) \ + M(Bool, prefer_cnch_catalog, true, "", 0) /// Settings that should not change after the creation of a table. #define APPLY_FOR_IMMUTABLE_CNCH_FILE_SETTINGS(M) M(index_granularity) diff --git a/src/Storages/RemoteFile/IStorageCloudFile.cpp b/src/Storages/RemoteFile/IStorageCloudFile.cpp index ad1d931776a..88588ebd825 100644 --- a/src/Storages/RemoteFile/IStorageCloudFile.cpp +++ b/src/Storages/RemoteFile/IStorageCloudFile.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -216,9 +217,9 @@ class PartitionedFileBlockOutputStream : public PartitionedBlockOutputStream BlockOutputStreamPtr createStreamForPartition(const String & partition_id) override { - auto path = PartitionedBlockOutputStream::replaceWildcards(uri, partition_id); + auto path = PartitionedBlockOutputStream::replaceWildcards(uri, partition_id, query_context->getPlanSegmentInstanceId().parallel_index); PartitionedBlockOutputStream::validatePartitionKey(path, true); - return std::make_shared(global_context, client, path, format, sample_block, compression_method); + return std::make_shared(query_context, client, path, format, sample_block, compression_method); } private: @@ -434,7 +435,7 @@ void IStorageCloudFile::read( BlockOutputStreamPtr IStorageCloudFile::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, const ContextPtr query_context) { - String current_uri = file_list.back(); + String current_uri = file_list.front(); bool has_wildcards = current_uri.find(PartitionedFileBlockOutputStream::PARTITION_ID_WILDCARD) != String::npos; const auto * insert_query = dynamic_cast(query.get()); auto partition_by_ast = insert_query ? (insert_query->partition_by ? 
insert_query->partition_by : arguments.partition_by) : nullptr; @@ -452,20 +453,28 @@ BlockOutputStreamPtr IStorageCloudFile::write(const ASTPtr & query, const Storag else { if (arguments.is_glob_path) - throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "URI '{}' contains globs, so the table is in readonly mode", arguments.url); + { + if (!has_wildcards) + throw Exception(ErrorCodes::DATABASE_ACCESS_DENIED, "URI '{}' contains globs and no `{}`, so the table is in readonly mode", arguments.url, PartitionedFileBlockOutputStream::PARTITION_ID_WILDCARD); + if (!partition_by_ast) + { + current_uri = PartitionedBlockOutputStream::replaceWildcards(current_uri, "", query_context->getPlanSegmentInstanceId().parallel_index); + LOG_TRACE(log, "URI `{}` has `{}` but no PARTITION BY, so replacing wildcards without a partition_id: {}", arguments.url, PartitionedFileBlockOutputStream::PARTITION_ID_WILDCARD, current_uri); + } + } FileURI file_uri(current_uri); if (client->exist(file_uri.file_path) && !query_context->getSettingsRef().overwrite_current_file) { if (query_context->getSettingsRef().insert_new_file) { - auto pos = file_list[0].find_first_of('.', file_list[0].find_last_of('/')); + auto pos = current_uri.find_first_of('.', current_uri.find_last_of('/')); size_t index = file_list.size(); String new_uri; do { - new_uri = file_list[0].substr(0, pos) + "." + std::to_string(index) - + (pos == std::string::npos ? "" : file_list[0].substr(pos)); + new_uri = current_uri.substr(0, pos) + "." + std::to_string(index) + + (pos == std::string::npos ? "" : current_uri.substr(pos)); ++index; } while (client->exist(new_uri)); diff --git a/src/Storages/RemoteFile/IStorageCnchFile.cpp b/src/Storages/RemoteFile/IStorageCnchFile.cpp index bb2179dc0ab..3aeecbf4385 100644 --- a/src/Storages/RemoteFile/IStorageCnchFile.cpp +++ b/src/Storages/RemoteFile/IStorageCnchFile.cpp @@ -306,7 +306,8 @@ void IStorageCnchFile::checkAlterSettings(const AlterCommands & commands) const "cnch_vw_write", //not use currently "cnch_vw_task", //not use currently "resources_assign_type", - "simple_hash_resources"}; + "simple_hash_resources", + "prefer_cnch_catalog"}; /// Check whether the value is legal for Setting. 
/// For example, we have a setting item, `SettingBool setting_test` @@ -331,7 +332,7 @@ void IStorageCnchFile::checkAlterSettings(const AlterCommands & commands) const Strings IStorageCnchFile::getPrunedFiles(const ContextPtr & query_context, const ASTPtr & query) { - Strings total_files = readFileList(); + Strings total_files = readFileList(query_context); if (query && virtual_header) { diff --git a/src/Storages/RemoteFile/IStorageCnchFile.h b/src/Storages/RemoteFile/IStorageCnchFile.h index a1734910e52..938dacdbf2c 100644 --- a/src/Storages/RemoteFile/IStorageCnchFile.h +++ b/src/Storages/RemoteFile/IStorageCnchFile.h @@ -45,7 +45,9 @@ class IStorageCnchFile : public IStorage, size_t max_block_size, unsigned num_streams) override; - virtual Strings readFileList() = 0; + virtual Strings readFileList(ContextPtr query_context) = 0; + + virtual void clear(ContextPtr query_context) = 0; /// read remote file parts by server local, not send resource to worker virtual void readByLocal( diff --git a/src/Storages/RemoteFile/StorageCloudHDFS.cpp b/src/Storages/RemoteFile/StorageCloudHDFS.cpp index c0c8a6c3e18..09ffa8ef7fb 100644 --- a/src/Storages/RemoteFile/StorageCloudHDFS.cpp +++ b/src/Storages/RemoteFile/StorageCloudHDFS.cpp @@ -49,7 +49,7 @@ std::unique_ptr StorageCloudHDFS::FileBufferClient::createReadBuffer std::unique_ptr StorageCloudHDFS::FileBufferClient::createWriteBuffer(const DB::String & file) { - return std::make_unique(file, query_context->getHdfsConnectionParams()); + return std::make_unique(file, query_context->getHdfsConnectionParams(), DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY, query_context->getSettingsRef().overwrite_current_file); } bool StorageCloudHDFS::FileBufferClient::exist(const DB::String & file) diff --git a/src/Storages/RemoteFile/StorageCloudHDFS.h b/src/Storages/RemoteFile/StorageCloudHDFS.h index 592e40b8d15..15e69e744c8 100644 --- a/src/Storages/RemoteFile/StorageCloudHDFS.h +++ b/src/Storages/RemoteFile/StorageCloudHDFS.h @@ -44,7 +44,7 @@ class StorageCloudHDFS : public shared_ptr_helper, public ISto ~StorageCloudHDFS() override = default; StorageCloudHDFS( - ContextMutablePtr context_, + ContextPtr context_, const StorageID & table_id_, const ColumnsDescription & required_columns_, const ConstraintsDescription & constraints_, diff --git a/src/Storages/RemoteFile/StorageCnchHDFS.cpp b/src/Storages/RemoteFile/StorageCnchHDFS.cpp index ed27c754225..384d6d5ef3d 100644 --- a/src/Storages/RemoteFile/StorageCnchHDFS.cpp +++ b/src/Storages/RemoteFile/StorageCnchHDFS.cpp @@ -1,3 +1,4 @@ +#include #include #if USE_HDFS @@ -30,6 +31,7 @@ # include # include # include +# include namespace DB { @@ -115,11 +117,45 @@ Strings ListFiles(const ContextPtr & context, const Strings & uris) return results; } -Strings StorageCnchHDFS::readFileList() +Strings StorageCnchHDFS::readFileList(ContextPtr query_context) { if (arguments.is_glob_path) - return ListFilesWithGlobs(getContext(), FileURI(arguments.url), {}); - return ListFiles(getContext(), file_list); + return ListFilesWithGlobs(query_context, FileURI(arguments.url), {}); + return ListFiles(query_context, file_list); +} + +void StorageCnchHDFS::clear(ContextPtr query_context) { + HDFSURI file(arguments.url); + + Poco::URI poco_uri(file.host_name); + HDFSBuilderPtr builder = query_context->getHdfsConnectionParams().createBuilder(poco_uri); + auto fs = createHDFSFS(builder.get()); + if (arguments.url.find(PartitionedBlockOutputStream::PARTITION_ID_WILDCARD) != String::npos) + { + // hdfsExists()=0 means exit + if (hdfsExists(fs.get(), 
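+ // (libhdfs convention: hdfsExists returns 0 when the path exists, so a
+ // non-zero result below means the directory is absent and there is
+ // nothing to clear)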
file.dir_path.c_str())) + { + LOG_TRACE(log, "Skip clearing {}: dir {} does not exist", getStorageID().getNameForLogs(), file.dir_path); + return; + } + + HDFSFileInfo ls; + ls.file_info = hdfsListDirectory(fs.get(), file.dir_path.c_str(), &ls.length); + if (!ls.file_info || !ls.length) + { + LOG_TRACE(log, "Skip clearing {}: dir {} is empty", getStorageID().getNameForLogs(), file.dir_path); + return; + } + + if (!hdfsDelete(fs.get(), file.dir_path.c_str(), true)) + { + LOG_WARNING(log, "Cleared {} dir {}", getStorageID().getNameForLogs(), file.dir_path); + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to delete the {} dir {}, error: {}", getStorageID().getNameForLogs(), file.dir_path, hdfsGetLastError()); + } + } } void StorageCnchHDFS::readByLocal( @@ -134,7 +170,7 @@ unsigned num_streams) { auto storage = StorageCloudHDFS::create( - getContext(), + query_context, getStorageID(), storage_snapshot->metadata->getColumns(), storage_snapshot->metadata->getConstraints(), @@ -155,7 +191,7 @@ BlockOutputStreamPtr StorageCnchHDFS::write(const ASTPtr & query, const StorageM BlockOutputStreamPtr StorageCnchHDFS::writeByLocal(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) { - auto storage = StorageCloudHDFS::create(getContext(), getStorageID(), metadata_snapshot->getColumns(), metadata_snapshot->getConstraints(), file_list, metadata_snapshot->getSettingsChanges(), arguments, settings); + auto storage = StorageCloudHDFS::create(query_context, getStorageID(), metadata_snapshot->getColumns(), metadata_snapshot->getConstraints(), file_list, metadata_snapshot->getSettingsChanges(), arguments, settings); auto streams = storage->write(query, metadata_snapshot, query_context); /// todo(jiashuo): insert new file and update the new file list in cache // file_list = storage->file_list; diff --git a/src/Storages/RemoteFile/StorageCnchHDFS.h b/src/Storages/RemoteFile/StorageCnchHDFS.h index 5ef05231845..5049b832ba3 100644 --- a/src/Storages/RemoteFile/StorageCnchHDFS.h +++ b/src/Storages/RemoteFile/StorageCnchHDFS.h @@ -15,7 +15,8 @@ namespace DB class StorageCnchHDFS : public shared_ptr_helper, public IStorageCnchFile { public: - Strings readFileList() override; + Strings readFileList(ContextPtr query_context) override; + void clear(ContextPtr query_context) override; /// read hdfs file parts by server local, not send resource to worker virtual void readByLocal( diff --git a/src/Storages/RemoteFile/StorageCnchS3.cpp b/src/Storages/RemoteFile/StorageCnchS3.cpp index 50e9801ba69..79785979d73 100644 --- a/src/Storages/RemoteFile/StorageCnchS3.cpp +++ b/src/Storages/RemoteFile/StorageCnchS3.cpp @@ -1,5 +1,6 @@ #include -#include "common/types.h" +#include +#include #include #if USE_AWS_S3 @@ -32,6 +33,7 @@ # include # include # include +# include namespace ProfileEvents { @@ -188,13 +190,23 @@ void StorageCnchS3::readByLocal( return storage->read(query_plan, column_names, storage_snapshot, query_info, query_context, processed_stage, max_block_size, num_streams); } -Strings StorageCnchS3::readFileList() +Strings StorageCnchS3::readFileList(ContextPtr) { if (arguments.is_glob_path) return ListKeysWithRegexpMatching(s3_util->getClient(), s3_uri); return file_list; } +void StorageCnchS3::clear(ContextPtr) +{ + if (s3_uri.key.find(PartitionedBlockOutputStream::PARTITION_ID_WILDCARD) != String::npos) + { + auto key_prefix = s3_uri.key.substr(0, s3_uri.key.find_first_of("*?{")); + s3_util->deleteObjectsWithPrefix(key_prefix, 
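+ // key_prefix is the S3 key truncated at the first glob metacharacter
+ // (*, ? or {), so every object previously written under the partitioned
+ // layout is removed; the predicate below accepts all listed keys.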
[](const S3::S3Util &, const String &){return true;}); + LOG_WARNING(log, "Cleared {} dir {}", getStorageID().getNameForLogs(), s3_uri.toString()); + } +} + BlockOutputStreamPtr StorageCnchS3::writeByLocal(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) { diff --git a/src/Storages/RemoteFile/StorageCnchS3.h b/src/Storages/RemoteFile/StorageCnchS3.h index 5687c0da5de..f829c998a73 100644 --- a/src/Storages/RemoteFile/StorageCnchS3.h +++ b/src/Storages/RemoteFile/StorageCnchS3.h @@ -20,7 +20,9 @@ S3ClientPtr initializeS3Client(const ContextPtr & ctx, const CnchFileArguments & class StorageCnchS3 : public shared_ptr_helper, public IStorageCnchFile { public: - Strings readFileList() override; + Strings readFileList(ContextPtr query_context) override; + + void clear(ContextPtr query_context) override; /// read s3 file by server local, not send resource to worker void readByLocal( diff --git a/src/Storages/System/StorageSystemNexusFS.cpp b/src/Storages/System/StorageSystemNexusFS.cpp new file mode 100644 index 00000000000..c3f535e9ece --- /dev/null +++ b/src/Storages/System/StorageSystemNexusFS.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +NamesAndTypesList StorageSystemNexusFS::getNamesAndTypes() +{ + return { + {"sub_file_path", std::make_shared()}, + {"total_size", std::make_shared()}, + {"cached_size", std::make_shared()}, + {"total_segments", std::make_shared()}, + {"cached_segments", std::make_shared()}, + }; +} + + +StorageSystemNexusFS::StorageSystemNexusFS(const StorageID & table_id_) + : IStorageSystemOneBlock(table_id_) +{ +} + +void StorageSystemNexusFS::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const +{ + auto nexus_fs = context->getNexusFS(); + if (!nexus_fs) + return; + auto files = nexus_fs->getFileCachedStates(); + for (const auto & file : files) + { + res_columns[0]->insert(file.file_path); + res_columns[1]->insert(file.total_size); + res_columns[2]->insert(file.cached_size); + res_columns[3]->insert(file.total_segments); + res_columns[4]->insert(file.cached_segments); + } +} + +} diff --git a/src/Storages/System/StorageSystemNexusFS.h b/src/Storages/System/StorageSystemNexusFS.h new file mode 100644 index 00000000000..8bd491456ce --- /dev/null +++ b/src/Storages/System/StorageSystemNexusFS.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class NexusFS; +class Context; + + +/** Implements system table nexus_fs, which exposes per-file cache state (total and cached sizes and segment counts) from NexusFS. 
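+ * 
+ * Columns (see getNamesAndTypes() in StorageSystemNexusFS.cpp): sub_file_path,
+ * total_size, cached_size, total_segments, cached_segments.
+ * A hedged usage sketch, assuming the table is attached under the system
+ * database as `nexus_fs` (see attachSystemTablesLocal below):
+ * 
+ *   SELECT sub_file_path, cached_size, total_size FROM system.nexus_fs;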
+ */ +class StorageSystemNexusFS final : public shared_ptr_helper, + public IStorageSystemOneBlock +{ + friend struct shared_ptr_helper; +public: + std::string getName() const override { return "SystemNexusFS"; } + + static NamesAndTypesList getNamesAndTypes(); + +protected: +#if defined(ARCADIA_BUILD) + StorageSystemNexusFS(const String & name_,) + : StorageSystemNexusFS(StorageID{"system", name_}) + { + } +#endif + explicit StorageSystemNexusFS(const StorageID & table_id_); + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index f24fda1408e..b46ef12ffc8 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,7 @@ void attachSystemTablesLocal(IDatabase & system_database) #endif attach(system_database, "io_schedulers"); attach(system_database, "io_workers"); + attach(system_database, "nexus_fs"); } diff --git a/tests/queries/4_cnch_stateless/00474_readonly_settings.sh b/tests/queries/4_cnch_stateless/00474_readonly_settings.sh index ed09fd5ccf8..225882b1066 100755 --- a/tests/queries/4_cnch_stateless/00474_readonly_settings.sh +++ b/tests/queries/4_cnch_stateless/00474_readonly_settings.sh @@ -22,6 +22,7 @@ RAW_URL=`echo "${RAW_URL}" | sed "s@enable_optimizer_fallback=1&@@g"` RAW_URL=`echo "${RAW_URL}" | sed "s@enable_optimizer_fallback=0&@@g"` RAW_URL=`echo "${RAW_URL}" | sed "s@bsp_mode=1&@@g"` RAW_URL=`echo "${RAW_URL}" | sed "s@tenant_id=1234&@@g"` +RAW_URL=`echo "${RAW_URL}" | sed "s@enable_nexus_fs=1&@@g"` ${CLICKHOUSE_CURL} -sS "${RAW_URL}&session_id=readonly&session_timeout=3600" -d 'SET readonly = 1' ${CLICKHOUSE_CURL} -sS "${RAW_URL}&session_id=readonly&query=SELECT+toUInt64(pow(2,+63))+as+value+format+JSON&output_format_json_quote_64bit_integers=1" 2>&1 | grep -o "value\|Cannot modify 'output_format_json_quote_64bit_integers' setting in readonly mode" diff --git a/tests/queries/4_cnch_stateless/00943_test_bitmap_index.reference b/tests/queries/4_cnch_stateless/00943_test_bitmap_index.reference index 0a72880b728..b897201e3c4 100644 --- a/tests/queries/4_cnch_stateless/00943_test_bitmap_index.reference +++ b/tests/queries/4_cnch_stateless/00943_test_bitmap_index.reference @@ -78,3 +78,6 @@ variadic mixed 2 1 1 +0 +0 +1 diff --git a/tests/queries/4_cnch_stateless/00943_test_bitmap_index.sql b/tests/queries/4_cnch_stateless/00943_test_bitmap_index.sql index 06a93cb93e6..86ea1f8b4fb 100644 --- a/tests/queries/4_cnch_stateless/00943_test_bitmap_index.sql +++ b/tests/queries/4_cnch_stateless/00943_test_bitmap_index.sql @@ -104,3 +104,8 @@ select count() from test_bitmap_index where (arraySetCheck(int_vid, 1) and array select count() from test_bitmap_index where (arraySetCheck(int_vid, 1) and arraySetCheck(float_vid, 2)) or arraySetCheck(int_vid, 2); -- 1 drop table if exists test_bitmap_index; + +-- test arraySetCheck without bitmap index +SELECT arraySetCheck([], []); +SELECT arraySetCheck([], [1,2]); +SELECT arraySetCheck([1], [1,2]); diff --git a/tests/queries/4_cnch_stateless/00982_materialized_view_match_rewrite_with_simple_mv.reference b/tests/queries/4_cnch_stateless/00982_materialized_view_match_rewrite_with_simple_mv.reference new file mode 100644 index 00000000000..379e6d0ef85 --- /dev/null +++ 
b/tests/queries/4_cnch_stateless/00982_materialized_view_match_rewrite_with_simple_mv.reference @@ -0,0 +1,25 @@ +1. test different aggregate +3 2 1 1 1 1 1 1 +4 3 1 2 1 1 1 1 +5 4 2 3 1 1 1 2 +3 2 1 1 1 1 1 1 +4 3 1 2 1 1 1 1 +5 4 2 3 1 1 1 2 +2. partition filter +4 1 +4 1 +3. partition filter with no data +3 1 +3 1 +4. no aggregate +1 2 +2 3 +3 4 +4 4 +1 2 +2 3 +3 4 +4 4 +5. no aggregate with filter +4 +4 diff --git a/tests/queries/4_cnch_stateless/00982_materialized_view_match_rewrite_with_simple_mv.sql b/tests/queries/4_cnch_stateless/00982_materialized_view_match_rewrite_with_simple_mv.sql new file mode 100644 index 00000000000..05015990457 --- /dev/null +++ b/tests/queries/4_cnch_stateless/00982_materialized_view_match_rewrite_with_simple_mv.sql @@ -0,0 +1,155 @@ +drop table if exists events; +CREATE TABLE events +( + `app_id` UInt32, + `app_name` String, + `device_id` String, + `content` String, + `ab_version` Array(Int32) BLOOM, + `string_params` Map(String, String) +) +ENGINE = CnchMergeTree +PARTITION BY (app_id, app_name) +ORDER BY (app_id, app_name, device_id); + +insert into events values ('1', '2', '3', '{}', [], {}); + +drop table if exists mv_events_target; +CREATE TABLE mv_events_target +( + `app_id` UInt32, + `app_name` String, + `device_id` String, + `content` String, + `ab_version` Array(Int32) BLOOM, + `string_params` Map(String, String), + `agent_id` String +) +ENGINE = CnchMergeTree +PARTITION BY (app_id, app_name) +ORDER BY (app_id, app_name, device_id); + +CREATE MATERIALIZED VIEW mv_events TO mv_events_target +( + `app_id` UInt32, + `app_name` String, + `device_id` String, + `content` String, + `ab_version` Array(Int32) BLOOM, + `string_params` Map(String, String), + `agent_id` String +) AS +SELECT + app_id, + app_name, + device_id, + content, + ab_version, + string_params, + JSONExtractString(content, '$.string_params.agent_id') AS agent_id +FROM events; + +insert into events values ('2', '3', '4', '{}', [], {}); +insert into events values ('3', '4', '5', '{}', [], {}); +insert into events values ('4', '4', '5', '{}', [], {}); + +select '1. test different aggregate'; +SELECT + device_id, + app_name, + count(1), + min(app_id), + countDistinct(JSONExtractString(content, '$.string_params.agent_id')), + countDistinct(JSONExtractString(content, '$.string_params.agent_id'), content), + finalizeAggregation(countDistinctState(JSONExtractString(content, '$.string_params.agent_id'), content)), + finalizeAggregation(countState()) +FROM events +GROUP BY device_id, app_name +ORDER BY device_id, app_name +SETTINGS enable_materialized_view_rewrite=0; + +SELECT + device_id, + app_name, + count(1), + min(app_id), + countDistinct(JSONExtractString(content, '$.string_params.agent_id')), + countDistinct(JSONExtractString(content, '$.string_params.agent_id'), content), + finalizeAggregation(countDistinctState(JSONExtractString(content, '$.string_params.agent_id'), content)), + finalizeAggregation(countState()) +FROM events +GROUP BY device_id, app_name +ORDER BY device_id, app_name +SETTINGS enforce_materialized_view_rewrite=1; + +select '2. partition filter'; +SELECT + device_id, + count(1) +FROM events +WHERE ((app_id IN (2)) AND (app_name = '3')) +GROUP BY device_id +ORDER BY device_id +SETTINGS enable_materialized_view_rewrite=0; + +SELECT + device_id, + count(1) +FROM events +WHERE ((app_id IN (2)) AND (app_name = '3')) +GROUP BY device_id +ORDER BY device_id +SETTINGS enforce_materialized_view_rewrite=1; + +select '3. 
partition filter with no data'; +SELECT + device_id, + count(1) +FROM events +WHERE ((app_id IN (1)) AND (app_name = '2')) +GROUP BY device_id +ORDER BY device_id +SETTINGS enable_materialized_view_rewrite=0; + +SELECT + device_id, + count(1) +FROM events +WHERE ((app_id IN (1)) AND (app_name = '2')) +GROUP BY device_id +ORDER BY device_id +SETTINGS enforce_materialized_view_rewrite=1; + +select '4. no aggregate'; +SELECT + app_id, + app_name, + JSONExtractString(content, '$.string_params.agent_id') +FROM events +ORDER BY app_id, app_name +SETTINGS enable_materialized_view_rewrite=0; + +SELECT + app_id, + app_name, + JSONExtractString(content, '$.string_params.agent_id') +FROM events +ORDER BY app_id, app_name +SETTINGS enforce_materialized_view_rewrite=1; + +select '5. no aggregate with filter'; +SELECT + device_id, + JSONExtractString(content, '$.string_params.agent_id') +FROM events +WHERE ((app_id IN (2)) AND (app_name = '3')) +ORDER BY device_id +SETTINGS enable_materialized_view_rewrite=0; + +SELECT + device_id, + JSONExtractString(content, '$.string_params.agent_id') +FROM events +WHERE ((app_id IN (2)) AND (app_name = '3')) +ORDER BY device_id +SETTINGS enforce_materialized_view_rewrite=1; \ No newline at end of file diff --git a/tests/queries/4_cnch_stateless/02032_short_circuit_least_greatest_bug.reference b/tests/queries/4_cnch_stateless/02032_short_circuit_least_greatest_bug.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/4_cnch_stateless/02032_short_circuit_least_greatest_bug.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/4_cnch_stateless/02032_short_circuit_least_greatest_bug.sql b/tests/queries/4_cnch_stateless/02032_short_circuit_least_greatest_bug.sql new file mode 100644 index 00000000000..384d85de9f0 --- /dev/null +++ b/tests/queries/4_cnch_stateless/02032_short_circuit_least_greatest_bug.sql @@ -0,0 +1,2 @@ +select 1 and greatest(toInt64(number % 2), toInt64(number % 3)) from numbers(1); +select 1 and least(toInt64(number % 2), toInt64(number % 3)) from numbers(1); diff --git a/tests/queries/4_cnch_stateless/10085_send_resources.reference b/tests/queries/4_cnch_stateless/10085_send_resources.reference new file mode 100644 index 00000000000..382a28ab6d9 --- /dev/null +++ b/tests/queries/4_cnch_stateless/10085_send_resources.reference @@ -0,0 +1,20 @@ +test wo/ catalog_enable_multiple_threads +1 10 +2 20 +3 30 +4 40 +5 50 +6 60 +7 70 +8 80 +9 90 +test w/ catalog_enable_multiple_threads +1 10 +2 20 +3 30 +4 40 +5 50 +6 60 +7 70 +8 80 +9 90 diff --git a/tests/queries/4_cnch_stateless/10085_send_resources.sql b/tests/queries/4_cnch_stateless/10085_send_resources.sql new file mode 100644 index 00000000000..c0c453e9d31 --- /dev/null +++ b/tests/queries/4_cnch_stateless/10085_send_resources.sql @@ -0,0 +1,22 @@ +-- test catalog_enable_multiple_threads + +drop table if exists t10085; +create table if not exists t10085 (a Int32, b Int32) engine = CnchMergeTree order by a; + +-- write 9 parts +system stop merges t10085; +insert into t10085 values (1, 10); +insert into t10085 values (2, 20); +insert into t10085 values (3, 30); +insert into t10085 values (4, 40); +insert into t10085 values (5, 50); +insert into t10085 values (6, 60); +insert into t10085 values (7, 70); +insert into t10085 values (8, 80); +insert into t10085 values (9, 90); + +select 'test wo/ catalog_enable_multiple_threads'; +select * from t10085 order by a settings catalog_enable_multiple_threads = 0, max_threads=4; +select 'test w/ 
catalog_enable_multiple_threads'; +select * from t10085 order by a settings catalog_enable_multiple_threads = 1, max_threads=4; +drop table if exists t10085; diff --git a/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql b/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql index ee315bf5a56..f57eccc8e60 100644 --- a/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql +++ b/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql @@ -1,7 +1,7 @@ USE test; set bsp_max_retry_num=0; -- disable bsp retry DROP TABLE IF EXISTS at_dc; -CREATE TABLE at_dc(a UInt32, p UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p; +CREATE TABLE at_dc(a UInt32, p UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p SETTINGS enable_nexus_fs = 0; INSERT INTO at_dc VALUES (1, 1), (2, 1), (3, 1); INSERT INTO at_dc VALUES (4, 2), (5, 2), (6, 2); @@ -23,7 +23,7 @@ SELECT a FROM at_dc WHERE p = 1 ORDER BY a SETTINGS disk_cache_mode = 'FORCE_DIS DROP TABLE at_dc; DROP TABLE IF EXISTS test_bucket_preload; -CREATE TABLE test_bucket_preload(a UInt32, p UInt32, c UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p CLUSTER BY c INTO 3 BUCKETS SETTINGS parts_preload_level = 1; +CREATE TABLE test_bucket_preload(a UInt32, p UInt32, c UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p CLUSTER BY c INTO 3 BUCKETS SETTINGS parts_preload_level = 1, enable_nexus_fs = 0; INSERT INTO test_bucket_preload SELECT number, 1, number % 7 FROM numbers(10); SELECT '---bucket---'; @@ -39,7 +39,7 @@ DROP TABLE test_bucket_preload; DROP TABLE IF EXISTS 11009_alter_disk_cache; SELECT '---all segments stores in single compressed block---'; -CREATE TABLE 11009_alter_disk_cache (d Decimal(4, 3)) ENGINE = CnchMergeTree ORDER BY d SETTINGS index_granularity = 1, parts_preload_level = 1; +CREATE TABLE 11009_alter_disk_cache (d Decimal(4, 3)) ENGINE = CnchMergeTree ORDER BY d SETTINGS index_granularity = 1, parts_preload_level = 1, enable_nexus_fs = 0; INSERT INTO 11009_alter_disk_cache SELECT toDecimal64(number, 3) FROM numbers(10000); ALTER DISK CACHE PRELOAD TABLE test.11009_alter_disk_cache SYNC SETTINGS parts_preload_level = 3; SELECT d FROM 11009_alter_disk_cache WHERE toFloat64(d) = 7777.0 settings disk_cache_mode = 'FORCE_DISK_CACHE'; diff --git a/tests/queries/4_cnch_stateless/46006_optimize_trivial_count.sql b/tests/queries/4_cnch_stateless/46006_optimize_trivial_count.sql index 85a63089cba..8ea0f92c102 100644 --- a/tests/queries/4_cnch_stateless/46006_optimize_trivial_count.sql +++ b/tests/queries/4_cnch_stateless/46006_optimize_trivial_count.sql @@ -104,4 +104,10 @@ UNION ALL SELECT count(*) FROM test46006; +drop table if exists test46006_1; +create table test46006_1(i int) ENGINE = CnchMergeTree() partition by i order by i; +insert into test46006_1 select count() from test46006; +insert into test46006_1 select count() from test46006 where i<10; +drop table if exists test46006_1; + drop table if exists test46006; diff --git a/tests/queries/4_cnch_stateless/48016_plan_cache.reference b/tests/queries/4_cnch_stateless/48016_plan_cache.reference index bebe8690cf7..e2dc5fe6390 100644 --- a/tests/queries/4_cnch_stateless/48016_plan_cache.reference +++ b/tests/queries/4_cnch_stateless/48016_plan_cache.reference @@ -28,3 +28,13 @@ 2 3 4 +2 +3 +3 +4 +4 +4 +4 +5 +5 +6 diff --git a/tests/queries/4_cnch_stateless/48016_plan_cache.sql b/tests/queries/4_cnch_stateless/48016_plan_cache.sql index d93381f891d..8108c92cf4b 100644 --- a/tests/queries/4_cnch_stateless/48016_plan_cache.sql +++ 
b/tests/queries/4_cnch_stateless/48016_plan_cache.sql @@ -45,5 +45,17 @@ from where t1.a > 5 )order by a; +select * from cache format Null; +ALTER TABLE cache drop column c; +select * from cache format Null; +select c1.a from cache c1, cache2 c2 where c1.a=c2.b order by c1.a; + +ALTER TABLE cache ADD column c Nullable(UInt64); +select * from cache format Null; + +select count() from cache2; +insert into cache2 values(1,2,3); +select count() from cache2; + DROP TABLE IF EXISTS cache; DROP TABLE IF EXISTS cache2; diff --git a/tests/queries/4_cnch_stateless/48019_execute_multi_subquery.sql b/tests/queries/4_cnch_stateless/48019_execute_multi_subquery.sql index c104835a395..4607ec6ccae 100644 --- a/tests/queries/4_cnch_stateless/48019_execute_multi_subquery.sql +++ b/tests/queries/4_cnch_stateless/48019_execute_multi_subquery.sql @@ -1,7 +1,6 @@ set dialect_type='ANSI'; set data_type_default_nullable=false; set enable_optimizer_for_create_select=1; -set fallback_use_cnch_catalog = 'NONE'; DROP TABLE if exists multi_subquery_source; DROP TABLE if exists multi_subquery_target; diff --git a/tests/queries/4_cnch_stateless/51001_index_col.sql b/tests/queries/4_cnch_stateless/51001_index_col.sql index 729dff7e0bc..2172ea803f4 100644 --- a/tests/queries/4_cnch_stateless/51001_index_col.sql +++ b/tests/queries/4_cnch_stateless/51001_index_col.sql @@ -13,7 +13,7 @@ create table test.multi_index_table ENGINE = CnchMergeTree PARTITION BY toStartOfInterval(ts, toIntervalHour(12)) ORDER BY ts -SETTINGS index_granularity = 8; +SETTINGS index_granularity = 8, enable_nexus_fs = 0; insert into table test.multi_index_table values ('2023-10-17 00:11:58.996', 'preload_test1', 'preload_test2', [1, 2, 3, 4, 5]) @@ -38,7 +38,7 @@ create table test.multi_index_table ENGINE = CnchMergeTree PARTITION BY toStartOfInterval(ts, toIntervalHour(12)) ORDER BY ts -SETTINGS index_granularity = 8; +SETTINGS index_granularity = 8, enable_nexus_fs = 0; insert into table test.multi_index_table values ('2022-10-17 00:11:58.996', 'preload_test1', 'preload_test2', [1, 2, 3, 4, 5]) diff --git a/tests/queries/4_cnch_stateless/71000_bucket_shuffle_join.sql b/tests/queries/4_cnch_stateless/71000_bucket_shuffle_join.sql index 61095b96ef6..9d4bad245fe 100644 --- a/tests/queries/4_cnch_stateless/71000_bucket_shuffle_join.sql +++ b/tests/queries/4_cnch_stateless/71000_bucket_shuffle_join.sql @@ -33,7 +33,7 @@ SET bsp_mode=0; -- bsp mode does not support bucket join SET enum_replicate_no_stats=0,enable_bucket_shuffle=1; SELECT 'enable bucket shuffle'; SELECT 'dtspartition'; -SELECT * FROM bucket_dtspartition b LEFT JOIN normal n ON b.d=n.b ORDER BY d; +SELECT * FROM bucket_dtspartition b LEFT JOIN normal n ON b.d=n.b ORDER BY d SETTINGS enable_internal_communication_user=0; SELECT * FROM bucket_dtspartition b Right JOIN normal n ON b.d=n.b ORDER BY b; SELECT * FROM bucket_dtspartition_with_range b LEFT JOIN normal n ON b.d=n.b ORDER BY d; SELECT * FROM bucket_dtspartition_with_range b RIGHT JOIN normal n ON b.d=n.b ORDER BY b; diff --git a/tests/queries/7_hive/00011_external_table_inline_expression.reference b/tests/queries/7_hive/00011_external_table_inline_expression.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/7_hive/00011_external_table_inline_expression.sh b/tests/queries/7_hive/00011_external_table_inline_expression.sh new file mode 100755 index 00000000000..49e9670896e --- /dev/null +++ b/tests/queries/7_hive/00011_external_table_inline_expression.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + 
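+# With common expression sharing enabled, the plan for this CnchHive external table must not contain an 'Inline expression' step: if the grep below matches, "fail" is printed and the output no longer matches the empty .reference file.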
+set -e + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh +. "$CURDIR"/setup_env.sh + +hive-cli "create database if not exists ${CLICKHOUSE_DATABASE}; \ +create table if not exists ${CLICKHOUSE_DATABASE}.t00011 (s1 string, s2 string, arr1 array<bigint>) stored as parquet;" + +${CLICKHOUSE_CLIENT} --query "create table if not exists hive_t00011 (s1 String, s2 String, arr1 Array(Int64)) engine = CnchHive('${HIVE_METASTORE}', '${CLICKHOUSE_DATABASE}', 't00011')" +( ${CLICKHOUSE_CLIENT} --query "explain select (lower(upper(substring(s1 || s2, 2)))) as a, count() from hive_t00011 prewhere length(a) > 10 group by a settings enable_optimizer=1, enable_common_expression_sharing = 1, enable_common_expression_sharing_for_prewhere = 1" | grep -ie 'Inline expression' ) && echo "fail" + +exit 0 diff --git a/tests/queries/8_cnch_S3_only/70000_cnch_s3_file.reference b/tests/queries/8_cnch_S3_only/70000_cnch_s3_file.reference new file mode 100644 index 00000000000..9a1821c6cab --- /dev/null +++ b/tests/queries/8_cnch_S3_only/70000_cnch_s3_file.reference @@ -0,0 +1,13 @@ +1 1 +2 1 +2 2 +1 +31 1 +32 2 +2 +4 1 +4 2 +4 3 +4 4 +4 1 +4 2 diff --git a/tests/queries/8_cnch_S3_only/70000_cnch_s3_file.sql b/tests/queries/8_cnch_S3_only/70000_cnch_s3_file.sql new file mode 100644 index 00000000000..3e67135d0ef --- /dev/null +++ b/tests/queries/8_cnch_S3_only/70000_cnch_s3_file.sql @@ -0,0 +1,66 @@ +drop table if exists ext_s3_table_1; +create table ext_s3_table_1 +( + `k` Int32, + `m` Int32 +) +engine = CnchS3("http://minio:9000/cnch/test_s3_ext_table/test_1.csv", 'CSV', 'none', 'minio', 'minio123'); + +set overwrite_current_file = 1; +insert into ext_s3_table_1 values (1, 1); +select * from ext_s3_table_1; -- 1 1 + +drop table if exists ext_s3_table_2; +create table ext_s3_table_2 +( + `k` Int32, + `m` Int32 +) +engine = CnchS3("http://minio:9000/cnch/test_s3_ext_table/partition_*_test_2.csv", 'CSV', 'none', 'minio', 'minio123'); + +insert into ext_s3_table_2 values (2, 1)(2, 2); +select * from ext_s3_table_2; -- 2 1; 2 2 +select count() from (select count(*) from ext_s3_table_2 group by _path); -- 1 + +set insert_new_file = 0; +set overwrite_current_file = 0; +insert into ext_s3_table_2 values (2, 1)(2, 2); --{serverError 36} + +drop table if exists ext_s3_table_3; +create table ext_s3_table_3 +( + `k` Int32, + `m` Int32 +) +engine = CnchS3("http://minio:9000/cnch/test_s3_ext_table/partition_*_test_3.csv", 'CSV', 'none', 'minio', 'minio123') partition by k; + +set overwrite_current_file = 1; +insert into ext_s3_table_3 values (31, 1)(32, 2); +select * from ext_s3_table_3 order by k; -- 31 1; 32 2 +select count(*) from ext_s3_table_2 group by _path; -- 2 + +drop table if exists s3_ext_table_source; +create table s3_ext_table_source +( + `k` Int32, + `m` Int32 +) +engine = CnchMergeTree +partition by k +order by m; + +drop table if exists ext_s3_table_4; +create table ext_s3_table_4 +( + `k` Int32, + `m` Int32 +) +engine = CnchS3("http://minio:9000/cnch/test_s3_ext_table/partition_*_test_4.csv", 'CSV', 'none', 'minio', 'minio123'); + +insert into s3_ext_table_source values (4, 1)(4, 2); +select * from s3_ext_table_source; -- 4 1; 4 2 + +insert into ext_s3_table_4 values (4, 3)(4, 4); +select * from ext_s3_table_4; -- 4 3; 4 4 +insert overwrite ext_s3_table_4 select * from s3_ext_table_source; +select * from ext_s3_table_4; -- 4 1; 4 2
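A minimal usage sketch for the system table introduced above, assuming a server built with this patch and NexusFS enabled (the column set comes from StorageSystemNexusFS::getNamesAndTypes() and the table name from attachSystemTablesLocal; the ordering and LIMIT are illustrative only):

-- Inspect per-file NexusFS cache occupancy; returns no rows when NexusFS is disabled,
-- since fillData() bails out early if Context::getNexusFS() yields nothing.
SELECT sub_file_path, total_size, cached_size, total_segments, cached_segments
FROM system.nexus_fs
ORDER BY cached_size DESC
LIMIT 10;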