diff --git a/docker/CI/docker-compose-nexusfs.yml b/docker/CI/docker-compose-nexusfs.yml new file mode 100644 index 0000000000..c4dda2deba --- /dev/null +++ b/docker/CI/docker-compose-nexusfs.yml @@ -0,0 +1,174 @@ +version: "3" + +services: + # After upgrade to docker-compose v2, we could use `include` instead of `extend`. + hdfs-namenode: + extends: + file: ./common/hdfs.yml + service: hdfs-namenode + hdfs-datanode: + extends: + file: ./common/hdfs.yml + service: hdfs-datanode + fdb: + extends: + file: ./common/fdb.yml + service: fdb + my_mysql: + extends: + file: ./common/mysql.yml + service: my_mysql + tso: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "fdbcli -C /config/fdb.cluster --exec \"configure new single ssd\"; tso-server --config-file /config/tso.yml" + depends_on: + - fdb + - hdfs-namenode + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/tso/:/var/log/byconity/:rw + environment: &env + LD_LIBRARY_PATH: /opt/byconity/lib + PATH: /opt/byconity/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin + ASAN_OPTIONS: + TSAN_OPTIONS: + IS_CI_ENV: 1 + CI_PIPELINE_NAME: CI + cap_add: + - SYS_PTRACE + healthcheck: + test: ["CMD", "curl", "localhost:18845"] + interval: 5s + + server-0: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "(udf-manager --config-file /config/server.yml & clickhouse-server --config-file /config/server.yml)" + depends_on: + tso: + condition: service_healthy + ports: + - "9000:52145" + - "127.0.0.1:8123:21557" + - "127.0.0.1:9004:9004" + environment: + <<: *env + SERVER_ID: server-0 + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/server-0/:/var/log/byconity/:rw + - ./queries/:/opt/byconity/queries/:ro + cap_add: + - SYS_PTRACE + healthcheck: + test: ["CMD", 
"curl", "localhost:21557"] + interval: 5s + + server-1: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "(udf-manager --config-file /config/server.yml & clickhouse-server --config-file /config/server.yml)" + depends_on: + tso: + condition: service_healthy + ports: + - "9001:52145" + - "127.0.0.1:8124:21557" + environment: + <<: *env + SERVER_ID: server-1 + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/server-1/:/var/log/byconity/:rw + - ./queries/:/opt/byconity/queries/:ro + cap_add: + - SYS_PTRACE + healthcheck: + test: ["CMD", "curl", "localhost:52145"] + interval: 5s + + worker-write: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "clickhouse-server --config-file /config/worker.yml" + depends_on: + - server-0 + - server-1 + ports: + - "52149:52145" + environment: + <<: *env + WORKER_GROUP_ID: wg_write + VIRTUAL_WAREHOUSE_ID: vw_write + WORKER_ID: w0 + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/worker-write/:/var/log/byconity/:rw + - ./queries/:/opt/byconity/queries/:ro + cap_add: + - SYS_PTRACE + worker-default: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "(udf-manager --config-file /config/worker.yml & clickhouse-server --config-file /config/worker.yml)" + depends_on: + - server-0 + - server-1 + environment: + <<: *env + WORKER_GROUP_ID: wg_default + VIRTUAL_WAREHOUSE_ID: vw_default + WORKER_ID: r0 + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/worker-default/:/var/log/byconity/:rw + - ./queries/:/opt/byconity/queries/:ro + cap_add: + - SYS_PTRACE + daemon-manager: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c 
"daemon-manager --config-file ./config/daemon-manager.yml" + depends_on: + server-0: + condition: service_healthy + server-1: + condition: service_healthy + environment: + <<: *env + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/daemon-manager/:/var/log/byconity/:rw + cap_add: + - SYS_PTRACE + restart: always + + resource-manager: + image: hub.byted.org/bytehouse/debian.bullseye.fdb.udf:0.1 + command: bash -c "resource-manager --config-file /config/resource-manager.yml" + depends_on: + - tso + volumes: + - ${CNCH_BINARY_PATH}/:/opt/byconity/bin/:ro + - ${CNCH_LIBRARY_PATH}/:/opt/byconity/lib/:ro + - ./nexusfs/:/config/:ro + - ./test_output/rm/:/var/log/byconity/:rw + environment: + <<: *env + cap_add: + - SYS_PTRACE + +volumes: + fdb-data: + external: false + hdfs-namenode: + external: false + hdfs-datanode: + external: false diff --git a/docker/CI/nexusfs/conf.d/catalog.yml b/docker/CI/nexusfs/conf.d/catalog.yml new file mode 100644 index 0000000000..7ddd723187 --- /dev/null +++ b/docker/CI/nexusfs/conf.d/catalog.yml @@ -0,0 +1,6 @@ +catalog: + name_space: default +catalog_service: + type: fdb + fdb: + cluster_file: /config/fdb.cluster diff --git a/docker/CI/nexusfs/conf.d/service_discovery.yml b/docker/CI/nexusfs/conf.d/service_discovery.yml new file mode 100644 index 0000000000..7627487161 --- /dev/null +++ b/docker/CI/nexusfs/conf.d/service_discovery.yml @@ -0,0 +1,115 @@ +service_discovery: + mode: local + cluster: default + disable_cache: false + cache_timeout: 5 + server: + psm: data.cnch.server + node: + - host: server-0 + hostname: server-0 + ports: + port: + - name: PORT2 + value: 21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + - host: server-1 + hostname: server-1 + ports: + port: + - name: PORT2 + value: 
21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + tso: + psm: data.cnch.tso + node: + host: tso + hostname: tso + ports: + port: + - name: PORT0 + value: 18845 + - name: PORT2 + value: 9181 + resource_manager: + psm: data.cnch.resource_manager + node: + host: resource-manager + hostname: resource-manager + ports: + port: + name: PORT0 + value: 28989 + daemon_manager: + psm: data.cnch.daemon_manager + node: + host: daemon-manager + hostname: daemon-manager + ports: + port: + name: PORT0 + value: 17553 + vw_psm: data.cnch.vw + vw: + psm: data.cnch.vw + node: + - host: worker-write + hostname: worker-write + ports: + port: + - name: PORT2 + value: 21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + vw_name: vw_write + - host: worker-default + hostname: worker-default + ports: + port: + - name: PORT2 + value: 21557 + - name: PORT1 + value: 30605 + - name: PORT0 + value: 52145 + - name: PORT4 + value: 27651 + - name: PORT3 + value: 45443 + - name: PORT5 + value: 47447 + - name: PORT6 + value: 60611 + vw_name: vw_default diff --git a/docker/CI/nexusfs/conf.d/storage.yml b/docker/CI/nexusfs/conf.d/storage.yml new file mode 100644 index 0000000000..020132e7b5 --- /dev/null +++ b/docker/CI/nexusfs/conf.d/storage.yml @@ -0,0 +1,18 @@ +hdfs_addr: hdfs://hdfs-namenode:9000 +storage_configuration: + disks: + hdfs_disk: + path: /user/clickhouse/ + type: bytehdfs + local_disk: + path: /var/byconity/data/ + type: local + policies: + default: + volumes: + hdfs: + default: hdfs_disk + disk: hdfs_disk + local: + default: local_disk + disk: local_disk diff --git a/docker/CI/nexusfs/daemon-manager.yml b/docker/CI/nexusfs/daemon-manager.yml new file mode 100644 index 0000000000..c4cbe3dcbf --- /dev/null 
+++ b/docker/CI/nexusfs/daemon-manager.yml @@ -0,0 +1,63 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: /var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 +http_port: 21557 +rpc_port: 30605 +tcp_port: 52145 +ha_tcp_port: 26247 +exchange_port: 47447 +exchange_status_port: 60611 +interserver_http_port: 30491 +listen_host: "0.0.0.0" +cnch_type: server +max_connections: 4096 +keep_alive_timeout: 3 +max_concurrent_queries: 200 +uncompressed_cache_size: 8589934592 +mark_cache_size: 5368709120 +path: /var/byconity/ +tmp_path: /var/byconity/tmp_data/ +users_config: /config/users.yml +default_profile: default +default_database: default +timezone: Europe/Moscow +mlock_executable: false +macros: + "-incl": macros + "-optional": true +builtin_dictionaries_reload_interval: 3600 +max_session_timeout: 3600 +default_session_timeout: 60 +dictionaries_config: "*_dictionary.xml" +format_schema_path: /var/byconity/format_schemas/ +perQuery: 1 +daemon_manager: + port: 17553 + daemon_jobs: + job: + - name: PART_GC + interval: 10000 + disable: 0 + - name: PART_MERGE + interval: 10000 + disable: 0 + - name: CONSUMER + interval: 10000 + disable: 0 + - name: GLOBAL_GC + interval: 5000 + disable: 1 + - name: PART_CLUSTERING + interval: 30000 + disable: 0 + - name: DEDUP_WORKER + interval: 3000 + disable: 0 + # Increasing the frequency of recycling in a test environment + - name: TXN_GC + interval: 3000 + disable: 0 diff --git a/docker/CI/nexusfs/fdb.cluster b/docker/CI/nexusfs/fdb.cluster new file mode 100644 index 0000000000..b04f02bc3b --- /dev/null +++ b/docker/CI/nexusfs/fdb.cluster @@ -0,0 +1 @@ +docker:docker@fdb:4550 diff --git a/docker/CI/nexusfs/resource-manager.yml b/docker/CI/nexusfs/resource-manager.yml new file mode 100644 index 0000000000..b53233f1d0 --- /dev/null +++ b/docker/CI/nexusfs/resource-manager.yml @@ -0,0 +1,29 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: 
/var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 +listen_host: "0.0.0.0" +path: /var/byconity/ +timezone: Europe/Moscow +perQuery: 1 +resource_manager: + port: 28989 + vws: + vw: + - name: vw_default + type: default + num_workers: 1 + worker_groups: + worker_group: + name: wg_default + type: Physical + - name: vw_write + type: write + num_workers: 1 + worker_groups: + worker_group: + name: wg_write + type: Physical diff --git a/docker/CI/nexusfs/server.yml b/docker/CI/nexusfs/server.yml new file mode 100644 index 0000000000..f03178bd0e --- /dev/null +++ b/docker/CI/nexusfs/server.yml @@ -0,0 +1,105 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: /var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 + console: true +additional_services: + GIS: 1 + VectorSearch: 1 + FullTextSearch: 1 +http_port: 21557 +rpc_port: 30605 +tcp_port: 52145 +ha_tcp_port: 26247 +exchange_port: 47447 +exchange_status_port: 60611 +interserver_http_port: 30491 +mysql_port: 9004 +listen_host: "0.0.0.0" +prometheus: + endpoint: "/metrics" + port: 0 + metrics: true + events: true + asynchronous_metrics: true + part_metrics: false +cnch_type: server +max_connections: 4096 +keep_alive_timeout: 3 +max_concurrent_queries: 200 +uncompressed_cache_size: 8589934592 +mark_cache_size: 5368709120 +path: /var/byconity/ +tmp_path: /var/byconity/tmp_data/ +users_config: /config/users.yml +default_profile: default +default_database: default +timezone: Europe/Moscow +mlock_executable: false +enable_tenant_systemdb: false +macros: + "-incl": macros + "-optional": true +builtin_dictionaries_reload_interval: 3600 +max_session_timeout: 3600 +default_session_timeout: 60 +dictionaries_config: "*_dictionary.xml" +format_schema_path: /var/byconity/format_schemas/ +perQuery: 1 +nexus_fs: + enable: 1 + use_memory_device: 0 + enable_async_io: 0 + cache_size: 5368709120 + region_size: 4194304 + segment_size: 524288 + 
enable_memory_buffer: 1 + memory_buffer_size: 1073741824 + clean_regions_pool: 16 + clean_region_threads: 4 + num_in_mem_buffers: 32 + reader_threads: 32 +merge_tree: + reorganize_marks_data_layout: 1 + enable_nexus_fs: 1 +cnch_kafka_log: + database: cnch_system + table: cnch_kafka_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +cnch_unique_table_log: + database: cnch_system + table: cnch_unique_table_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +cnch_query_log: + database: cnch_system + table: cnch_query_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +query_log: + database: system + table: query_log + flush_interval_milliseconds: 15000 + partition_by: event_date +part_allocation_algorithm: 1 +consistent_hash_ring: + num_replicas: 16 + num_probes: 21 + load_factor: 1.3 +udf_path: /var/byconity/data/user_defined +udf_manager_server: + timeout_ms: 20000 + max_retry: 1 +udf_processor: + count: 3 + uds_path: /dev/shm/udf_processor_server + timeout_ms: 10000 + max_retry: 1 +custom_settings_prefixes: SQL_ +restrict_tenanted_users_to_whitelist_settings: false +restrict_tenanted_users_to_privileged_operations: false +sensitive_permission_tenants: 1234 diff --git a/docker/CI/nexusfs/tso.yml b/docker/CI/nexusfs/tso.yml new file mode 100644 index 0000000000..095eb2ebe7 --- /dev/null +++ b/docker/CI/nexusfs/tso.yml @@ -0,0 +1,22 @@ +logger: + level: trace + log: /var/log/byconity/tso.log + errorlog: /var/log/byconity/tso.err.log + testlog: /var/log/byconity/tso.test.log + size: 1000M + count: 10 + console: false +listen_host: "0.0.0.0" +path: /var/byconity/tso +tmp_path: /var/byconity/tmp +tso_service: + type: fdb + fdb: + cluster_file: /config/fdb.cluster + port: 18845 + http: + port: 9181 + receive_timeout: 1800 + send_timeout: 1800 + tso_window_ms: 3000 + tso_get_leader_info_interval_ms: 0 diff --git a/docker/CI/nexusfs/users.yml b/docker/CI/nexusfs/users.yml new file mode 100644 index 
0000000000..61e2e5a63d --- /dev/null +++ b/docker/CI/nexusfs/users.yml @@ -0,0 +1,38 @@ +profiles: + default: + load_balancing: random + log_queries: 1 + max_execution_time: 180 + exchange_timeout_ms: 300000 + enable_nexus_fs: 1 + +users: + default: + networks: + ip: ::/0 + password: "" + profile: default + quota: default + access_management: 1 + server: + networks: + ip: ::/0 + password: "" + profile: default + quota: default + probe: + networks: + ip: ::/0 + password: "" + profile: default + quota: default + +quotas: + default: + interval: + duration: 3600 + queries: 0 + errors: 0 + result_rows: 0 + read_rows: 0 + execution_time: 0 \ No newline at end of file diff --git a/docker/CI/nexusfs/worker.yml b/docker/CI/nexusfs/worker.yml new file mode 100644 index 0000000000..a97e011eb5 --- /dev/null +++ b/docker/CI/nexusfs/worker.yml @@ -0,0 +1,82 @@ +logger: + level: trace + log: /var/log/byconity/out.log + errorlog: /var/log/byconity/err.log + testlog: /var/log/byconity/test.log + size: 1000M + count: 10 +http_port: 21557 +rpc_port: 30605 +tcp_port: 52145 +ha_tcp_port: 26247 +exchange_port: 47447 +exchange_status_port: 60611 +interserver_http_port: 30491 +listen_host: "0.0.0.0" +cnch_type: worker +vw_name: vw_default +max_connections: 4096 +keep_alive_timeout: 3 +max_concurrent_queries: 200 +uncompressed_cache_size: 8589934592 +mark_cache_size: 5368709120 +path: /var/byconity/ +tmp_path: /var/byconity/tmp_data/ +users_config: /config/users.yml +default_profile: default +default_database: default +timezone: Europe/Moscow +mlock_executable: false +enable_tenant_systemdb: false +macros: + "-incl": macros + "-optional": true +builtin_dictionaries_reload_interval: 3600 +max_session_timeout: 3600 +default_session_timeout: 60 +dictionaries_config: "*_dictionary.xml" +format_schema_path: /var/byconity/format_schemas/ +perQuery: 1 +nexus_fs: + enable: 1 + use_memory_device: 0 + enable_async_io: 0 + cache_size: 5368709120 + region_size: 4194304 + segment_size: 524288 + 
enable_memory_buffer: 1 + memory_buffer_size: 1073741824 + clean_regions_pool: 16 + clean_region_threads: 4 + num_in_mem_buffers: 32 + reader_threads: 32 +merge_tree: + reorganize_marks_data_layout: 1 + enable_nexus_fs: 1 +cnch_unique_table_log: + database: cnch_system + table: cnch_unique_table_log + flush_max_row_count: 10000 + flush_interval_milliseconds: 7500 +query_log: + database: system + table: query_log + flush_interval_milliseconds: 15000 + partition_by: event_date +udf_path: /var/byconity/data/user_defined +udf_manager_server: + timeout_ms: 20000 + max_retry: 1 +udf_processor: + count: 3 + uds_path: /dev/shm/udf_processor_worker + timeout_ms: 10000 + max_retry: 1 +restrict_tenanted_users_to_system_tables: false +restrict_tenanted_users_to_whitelist_settings: false +restrict_tenanted_users_to_privileged_operations: false +additional_services: + FullTextSearch: true + VectorSearch: true + GIS: true +sensitive_permission_tenants: 1234 diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 525e72e887..82210de337 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -1114,21 +1114,55 @@ M(RegionManagerNumInMemBufCleanupRetries, "RegionManager number of in-memory buffer cleanup retries") \ M(RegionManagerCleanRegionRetries, "RegionManager number of clean region retries") \ \ - M(NexusFSDiskCacheHit, "NexusFS disk cache hits") \ - M(NexusFSDiskCacheHitInflightInsert, "NexusFS disk cache hits on in-flight inserts") \ - M(NexusFSDiskCacheMiss, "NexusFS disk cache misses") \ + M(NexusFSHit, "NexusFS hits") \ + M(NexusFSHitInflightInsert, "NexusFS hits on in-flight inserts") \ + M(NexusFSMiss, "NexusFS misses") \ + M(NexusFSPreload, "NexusFS preloads") \ + M(NexusFSDeepRetry, "NexusFS deep retries") \ M(NexusFSDiskCacheEvict, "NexusFS disk cache evicts") \ - M(NexusFSDiskCachePreload, "NexusFS disk cache preloads") \ - M(NexusFSDiskCacheLookupRetries, "NexusFS disk cache retries in lookup") \ - 
M(NexusFSDiskCacheInsertRetries, "NexusFS disk cache retries in insert") \ + M(NexusFSDiskCacheInsertRetries, "NexusFS disk cache retries when insert") \ M(NexusFSDiskCacheError, "NexusFS disk cache errors") \ - M(NexusFSDiskCacheBytesRead, "NexusFS disk cache total bytes read") \ - M(NexusFSDiskCacheBytesWrite, "NexusFS disk cache total bytes write") \ - M(NexusFSMemoryBufferHit, "NexusFS memory buffer hits") \ - M(NexusFSMemoryBufferMiss, "NexusFS memory buffer misses") \ - M(NexusFSMemoryBufferEvict, "NexusFS memory buffer evicts") \ - M(NexusFSMemoryBufferError, "NexusFS memory buffer errors") \ - M(NexusFSMemoryBufferBytesRead, "NexusFS memory buffer total bytes read") \ + M(NexusFSDiskCacheBytesRead, "NexusFS disk cache bytes read") \ + M(NexusFSDiskCacheBytesWrite, "NexusFS disk cache bytes write") \ + M(NexusFSReadFromInsertCxt, "NexusFS ReadFromInsertCxt successes") \ + M(NexusFSReadFromInsertCxtRetry, "NexusFS ReadFromInsertCxt retries") \ + M(NexusFSReadFromInsertCxtDeepRetry, "NexusFS ReadFromInsertCxt deep retries") \ + M(NexusFSReadFromInsertCxtBytesRead, "NexusFS ReadFromInsertCxt bytes read") \ + M(NexusFSReadFromInsertCxtNonCopy, "NexusFS ReadFromInsertCxt by non-copying method successes") \ + M(NexusFSReadFromInsertCxtNonCopyBytesRead, "NexusFS ReadFromInsertCxt by non-copying method bytes read") \ + M(NexusFSReadFromDisk, "NexusFS ReadFromDisk successes") \ + M(NexusFSReadFromDiskRetry, "NexusFS ReadFromDisk retries") \ + M(NexusFSReadFromDiskDeepRetry, "NexusFS ReadFromDisk deep retries") \ + M(NexusFSReadFromDiskBytesRead, "NexusFS ReadFromDisk bytes read") \ + M(NexusFSReadFromBuffer, "NexusFS ReadFromBuffer successes") \ + M(NexusFSReadFromBufferRetry, "NexusFS ReadFromBuffer retries") \ + M(NexusFSReadFromBufferDeepRetry, "NexusFS ReadFromBuffer deep retries") \ + M(NexusFSReadFromBufferBytesRead, "NexusFS ReadFromBuffer bytes read") \ + M(NexusFSReadFromBufferNonCopy, "NexusFS ReadFromBuffer by non-copying method successes") \ + 
M(NexusFSReadFromBufferNonCopyBytesRead, "NexusFS ReadFromBuffer by non-copying method bytes read") \ + M(NexusFSReadFromSourceBytesRead, "NexusFS bytes read from source") \ + M(NexusFSReadFromSourceMicroseconds, "NexusFS read from source microseconds") \ + M(NexusFSTimeout, "NexusFS read timeouts") \ + M(NexusFSPrefetchToBuffer, "NexusFS PrefetchToBuffer successes") \ + M(NexusFSPrefetchToBufferBytesRead, "NexusFS PrefetchToBuffer bytes read") \ + M(NexusFSBufferHit, "NexusFS buffer hits") \ + M(NexusFSBufferMiss, "NexusFS buffer misses") \ + M(NexusFSBufferPreload, "NexusFS buffer preloads") \ + M(NexusFSBufferPreloadRetry, "NexusFS buffer retries in preload") \ + M(NexusFSBufferEmptyCoolingQueue, "NexusFS buffer cooling queue empty") \ + M(NexusFSInodeManagerLookupMicroseconds, "NexusFS InodeManager lookup microseconds") \ + M(NexusFSInodeManagerInsertMicroseconds, "NexusFS InodeManager insert microseconds") \ +\ + M(ReadFromNexusFSReadBytes, "Read bytes from NexusFS.") \ + M(ReadFromNexusFSSeeks, "Total number of seeks for async buffer") \ + M(ReadFromNexusFSPrefetchRequests, "Number of prefetches made with asynchronous reading from NexusFS") \ + M(ReadFromNexusFSUnusedPrefetches, "Number of prefetches pending at buffer destruction") \ + M(ReadFromNexusFSPrefetchedReads, "Number of reads from prefetched buffer") \ + M(ReadFromNexusFSPrefetchTaskWait, "Number of waiting when reading from prefetched buffer") \ + M(ReadFromNexusFSPrefetchTaskNotWait, "Number of not waiting when reading from prefetched buffer") \ + M(ReadFromNexusFSPrefetchedBytes, "Number of bytes from prefetched buffer") \ + M(ReadFromNexusFSAsynchronousWaitMicroseconds, "Time spent in waiting for asynchronous NexusFS reads.") \ + M(ReadFromNexusFSSynchronousWaitMicroseconds, "Time spent in waiting for synchronous NexusFS reads.") \ \ M(TSORequest, "Number requests sent to TSO") \ M(TSORequestMicroseconds, "Total time spent in get timestamp from TSO") \ @@ -1246,13 +1280,20 @@ uint64_t 
Counters::getIOReadTime(bool use_async_read) const if (use_async_read) { return counters[ProfileEvents::RemoteFSAsynchronousReadWaitMicroseconds] - + counters[ProfileEvents::RemoteFSSynchronousReadWaitMicroseconds] + counters[ProfileEvents::DiskReadElapsedMicroseconds]; + + counters[ProfileEvents::RemoteFSSynchronousReadWaitMicroseconds] + + counters[ProfileEvents::DiskReadElapsedMicroseconds] + + counters[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds] + + counters[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds]; } // Else, we calculate the origin read IO time else { - return counters[ProfileEvents::HDFSReadElapsedMicroseconds] + counters[ProfileEvents::ReadBufferFromS3ReadMicroseconds] - + counters[ProfileEvents::DiskReadElapsedMicroseconds]; + return counters[ProfileEvents::HDFSReadElapsedMicroseconds] + + counters[ProfileEvents::ReadBufferFromS3ReadMicroseconds] + + counters[ProfileEvents::DiskReadElapsedMicroseconds] + + counters[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds] + + counters[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds] + - counters[ProfileEvents::NexusFSReadFromSourceMicroseconds]; } } @@ -1272,14 +1313,19 @@ uint64_t Counters::Snapshot::getIOReadTime(bool use_async_read) const { return counters_holder[ProfileEvents::RemoteFSAsynchronousReadWaitMicroseconds] + counters_holder[ProfileEvents::RemoteFSSynchronousReadWaitMicroseconds] - + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds]; + + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds] + + counters_holder[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds] + + counters_holder[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds]; } // Else, we calculate the origin read IO time else { return counters_holder[ProfileEvents::HDFSReadElapsedMicroseconds] + counters_holder[ProfileEvents::ReadBufferFromS3ReadMicroseconds] - + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds]; 
+ + counters_holder[ProfileEvents::DiskReadElapsedMicroseconds] + + counters_holder[ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds] + + counters_holder[ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds] + - counters_holder[ProfileEvents::NexusFSReadFromSourceMicroseconds]; } } diff --git a/src/Disks/DiskByteS3.cpp b/src/Disks/DiskByteS3.cpp index 902f63bada..53341de64c 100644 --- a/src/Disks/DiskByteS3.cpp +++ b/src/Disks/DiskByteS3.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -247,20 +247,21 @@ std::unique_ptr DiskByteS3::readFile(const String & path { ReadSettings modified_settings{settings}; modified_settings.for_disk_s3 = true; + auto nexus_fs = settings.enable_nexus_fs ? Context::getGlobalContextInstance()->getNexusFS() : nullptr; + bool use_external_buffer = nexus_fs ? false : settings.remote_fs_prefetch; std::unique_ptr impl; - { - impl = std::make_unique( - s3_util.getClient(), s3_util.getBucket(), object_key, modified_settings, 3, false, settings.remote_fs_prefetch); - } + impl = std::make_unique( + s3_util.getClient(), s3_util.getBucket(), object_key, modified_settings, 3, false, use_external_buffer); - if (settings.enable_nexus_fs) + if (nexus_fs) { - auto nexus_fs = Context::getGlobalContextInstance()->getNexusFS(); - if (nexus_fs) - impl = std::make_unique(nexus_fs->getSegmentSize(), std::move(impl), *nexus_fs); + impl = std::make_unique( + settings.local_fs_buffer_size, + settings.remote_fs_prefetch, + std::move(impl), + *nexus_fs); } - - if (settings.remote_fs_prefetch) + else if (settings.remote_fs_prefetch) { auto impl = std::make_unique(s3_util.getClient(), s3_util.getBucket(), object_key, modified_settings, 3, false, /* use_external_buffer */true); diff --git a/src/Disks/HDFS/DiskByteHDFS.cpp b/src/Disks/HDFS/DiskByteHDFS.cpp index 915d57deab..7c60d065b1 100644 --- a/src/Disks/HDFS/DiskByteHDFS.cpp +++ b/src/Disks/HDFS/DiskByteHDFS.cpp @@ -21,7 +21,7 @@ #include #include 
#include -#include +#include #include #include #include @@ -237,18 +237,21 @@ std::unique_ptr DiskByteHDFS::readFile(const String & pa } else { + auto nexus_fs = settings.enable_nexus_fs ? Context::getGlobalContextInstance()->getNexusFS() : nullptr; + bool use_external_buffer = nexus_fs ? false : settings.remote_fs_prefetch; std::unique_ptr impl; + impl = std::make_unique( + file_path, hdfs_params, settings, nullptr, 0, use_external_buffer); - impl = std::make_unique(file_path, hdfs_params, settings, - nullptr, 0, /* use_external_buffer */ settings.remote_fs_prefetch); - if (settings.enable_nexus_fs) + if (nexus_fs) { - auto nexus_fs = Context::getGlobalContextInstance()->getNexusFS(); - if (nexus_fs) - impl = std::make_unique(nexus_fs->getSegmentSize(), std::move(impl), *nexus_fs); + impl = std::make_unique( + settings.local_fs_buffer_size, + settings.remote_fs_prefetch, + std::move(impl), + *nexus_fs); } - - if (settings.remote_fs_prefetch) + else if (settings.remote_fs_prefetch) { auto global_context = Context::getGlobalContextInstance(); auto reader = global_context->getThreadPoolReader(); diff --git a/src/IO/ReadBufferFromFileWithNexusFS.cpp b/src/IO/ReadBufferFromFileWithNexusFS.cpp deleted file mode 100644 index 55f1190900..0000000000 --- a/src/IO/ReadBufferFromFileWithNexusFS.cpp +++ /dev/null @@ -1,146 +0,0 @@ -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; - extern const int SEEK_POSITION_OUT_OF_BOUND; -} - -ReadBufferFromFileWithNexusFS::ReadBufferFromFileWithNexusFS( - size_t buf_size, - std::unique_ptr source_read_buffer_, - NexusFS &nexus_fs_) - : ReadBufferFromFileBase(buf_size, nullptr, 0) - , file_name(source_read_buffer_->getFileName()) - , source_read_buffer(std::move(source_read_buffer_)) - , nexus_fs(nexus_fs_) -{ -} - -bool ReadBufferFromFileWithNexusFS::nextImpl() -{ - if (read_until_position) - { - if (read_until_position == offset) - return false; - - if (read_until_position < offset) - { - throw 
Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); - } - } - - /// If internal_buffer size is empty, then read() cannot be distinguished from EOF - chassert(!internal_buffer.empty()); - - size_t max_size_to_read = internal_buffer.size(); - if (read_until_position) - { - max_size_to_read = std::min(max_size_to_read, static_cast(read_until_position - offset)); - } - - size_t bytes_read = nexus_fs.read(file_name, offset, max_size_to_read, source_read_buffer, internal_buffer.begin()); - - if (bytes_read) - { - working_buffer = internal_buffer; - working_buffer.resize(bytes_read); - offset += bytes_read; - return true; - } - - return false; -} - -off_t ReadBufferFromFileWithNexusFS::seek(off_t offset_, int whence) -{ - if (whence == SEEK_CUR) - offset_ = getPosition() + offset_; - else if (whence != SEEK_SET) - throw Exception("Seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::BAD_ARGUMENTS); - - if (offset_ < 0) - throw Exception("Seek position is out of bounds. 
Offset: " + std::to_string(offset_), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); - - if (offset_ == getPosition()) - return offset_; - - if (!working_buffer.empty() - && static_cast(offset_) >= offset - working_buffer.size() - && offset_ < offset) - { - pos = working_buffer.end() - (offset - offset_); - assert(pos >= working_buffer.begin()); - assert(pos < working_buffer.end()); - - return getPosition(); - } - - resetWorkingBuffer(); - offset = offset_; - - return offset; -} - -IAsynchronousReader::Result ReadBufferFromFileWithNexusFS::readInto(char * data, size_t size, size_t read_offset, size_t ignore_bytes) -{ - bool result = false; - offset = read_offset; - set(data, size); - - if (ignore_bytes) - { - ignore(ignore_bytes); - result = hasPendingData(); - ignore_bytes = 0; - } - - if (!result) - result = next(); - - if (result) - { - assert(available()); - return { working_buffer.size(), BufferBase::offset(), nullptr }; - } - - return {0, 0, nullptr}; -} - -size_t ReadBufferFromFileWithNexusFS::readBigAt(char * to, size_t n, size_t range_begin, const std::function & progress_callback) -{ - if (n == 0) - return 0; - - size_t bytes_read = nexus_fs.read(file_name, range_begin, n, source_read_buffer, to); - - if (bytes_read && progress_callback) - progress_callback(bytes_read); - return bytes_read; -} - -void ReadBufferFromFileWithNexusFS::setReadUntilPosition(size_t position) -{ - if (position != static_cast(read_until_position)) - { - offset = getPosition(); - resetWorkingBuffer(); - read_until_position = position; - } -} - -void ReadBufferFromFileWithNexusFS::setReadUntilEnd() -{ - if (read_until_position) - { - offset = getPosition(); - resetWorkingBuffer(); - read_until_position = 0; - } -} - -} diff --git a/src/IO/ReadBufferFromNexusFS.cpp b/src/IO/ReadBufferFromNexusFS.cpp new file mode 100644 index 0000000000..5b9759a672 --- /dev/null +++ b/src/IO/ReadBufferFromNexusFS.cpp @@ -0,0 +1,295 @@ +#include +#include "Common/Exception.h" +#include 
"common/logger_useful.h" +#include + + +namespace CurrentMetrics +{ + extern const Metric AsynchronousReadWait; +} + +namespace ProfileEvents +{ + extern const Event ReadFromNexusFSReadBytes; + extern const Event ReadFromNexusFSAsynchronousWaitMicroseconds; + extern const Event ReadFromNexusFSSynchronousWaitMicroseconds; + extern const Event ReadFromNexusFSSeeks; + extern const Event ReadFromNexusFSPrefetchRequests; + extern const Event ReadFromNexusFSUnusedPrefetches; + extern const Event ReadFromNexusFSPrefetchedReads; + extern const Event ReadFromNexusFSPrefetchedBytes; + extern const Event ReadFromNexusFSPrefetchTaskWait; + extern const Event ReadFromNexusFSPrefetchTaskNotWait; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int SEEK_POSITION_OUT_OF_BOUND; +} + +ReadBufferFromNexusFS::ReadBufferFromNexusFS( + size_t buf_size_, + bool actively_prefetch_, + std::unique_ptr source_read_buffer_, + NexusFS &nexus_fs_) + : ReadBufferFromFileBase(nexus_fs_.supportNonCopyingRead() ? 
0 : buf_size_, nullptr, 0) + , file_name(source_read_buffer_->getFileName()) + , source_read_buffer(std::move(source_read_buffer_)) + , nexus_fs(nexus_fs_) + , buf_size(buf_size_) + , read_to_internal_buffer(!nexus_fs_.supportNonCopyingRead()) + , actively_prefetch(actively_prefetch_) +{ +} + +ReadBufferFromNexusFS::~ReadBufferFromNexusFS() +{ + try + { + resetPrefetch(); + } + catch (Exception & e) + { + LOG_WARNING(log, "resetPrefetch raises exception: {}", e.message()); + } +} + +bool ReadBufferFromNexusFS::nextImpl() +{ + if (!hasPendingDataToRead()) + return false; + + if (!read_to_internal_buffer) + { + // read from nexusfs by non-copying method + nexusfs_buffer.reset(); + + // first, check if there prefetched data + if (prefetch_future.valid()) + { + ProfileEventTimeIncrement watch(ProfileEvents::ReadFromNexusFSAsynchronousWaitMicroseconds); + CurrentMetrics::Increment metric_increment{CurrentMetrics::AsynchronousReadWait}; + + if (prefetch_future.wait_for(std::chrono::seconds(0)) == std::future_status::ready) + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchTaskNotWait); + else + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchTaskWait); + + nexusfs_buffer = prefetch_future.get(); + auto size = nexusfs_buffer.getSize(); + + prefetch_future = {}; + + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchedReads); + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchedBytes, size); + } + else + { + ProfileEventTimeIncrement watch(ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds); + size_t max_size_to_read = read_until_position ? 
read_until_position - offset : buf_size; + nexusfs_buffer = nexus_fs.read( + file_name, + offset, + max_size_to_read, + source_read_buffer); + } + + size_t bytes_read = nexusfs_buffer.getSize(); + if (bytes_read == 0) + return false; + + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSReadBytes, bytes_read); + BufferBase::set(nexusfs_buffer.getData(), bytes_read, 0); + offset += bytes_read; + + if (actively_prefetch) + prefetch(Priority{0}); + + return true; + } + + size_t max_size_to_read = internal_buffer.size(); + if (read_until_position) + { + max_size_to_read = std::min(max_size_to_read, static_cast(read_until_position - offset)); + } + + size_t total_bytes_read = 0; + { + ProfileEventTimeIncrement watch(ProfileEvents::ReadFromNexusFSSynchronousWaitMicroseconds); + do + { + size_t bytes_read = nexus_fs.read( + file_name, + offset + total_bytes_read, + max_size_to_read - total_bytes_read, + source_read_buffer, + internal_buffer.begin() + total_bytes_read); + + if (bytes_read == 0) + break; + total_bytes_read += bytes_read; + } + while (total_bytes_read < max_size_to_read); + } + + if (total_bytes_read) + { + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSReadBytes, total_bytes_read); + working_buffer = internal_buffer; + working_buffer.resize(total_bytes_read); + offset += total_bytes_read; + return true; + } + + return false; +} + +off_t ReadBufferFromNexusFS::seek(off_t offset_, int whence) +{ + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSSeeks); + if (whence == SEEK_CUR) + offset_ = getPosition() + offset_; + else if (whence != SEEK_SET) + throw Exception("Seek expects SEEK_SET or SEEK_CUR as whence", ErrorCodes::BAD_ARGUMENTS); + + if (offset_ < 0) + throw Exception("Seek position is out of bounds. 
Offset: " + std::to_string(offset_), ErrorCodes::SEEK_POSITION_OUT_OF_BOUND); + + if (offset_ == getPosition()) + return offset_; + + if (!working_buffer.empty() + && static_cast(offset_) >= offset - working_buffer.size() + && offset_ < offset) + { + pos = working_buffer.end() - (offset - offset_); + assert(pos >= working_buffer.begin()); + assert(pos < working_buffer.end()); + + return getPosition(); + } + + resetWorkingBuffer(); + resetPrefetch(); + offset = offset_; + + return offset; +} + +IAsynchronousReader::Result ReadBufferFromNexusFS::readInto(char * data, size_t size, size_t read_offset, size_t ignore_bytes) +{ + bool result = false; + offset = read_offset; + set(data, size); + + auto original_status = read_to_internal_buffer; + read_to_internal_buffer = true; + + if (ignore_bytes) + { + ignore(ignore_bytes); + result = hasPendingData(); + ignore_bytes = 0; + } + if (!result) + result = next(); + + read_to_internal_buffer = original_status; + + if (result) + { + assert(available()); + return { working_buffer.size(), BufferBase::offset(), nullptr }; + } + + return {0, 0, nullptr}; +} + +bool ReadBufferFromNexusFS::hasPendingDataToRead() +{ + if (read_until_position) + { + if (read_until_position == offset) + return false; + + if (read_until_position < offset) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1); + } + } + + return true; +} + +void ReadBufferFromNexusFS::prefetch(Priority) +{ + if (!nexus_fs.supportPrefetch()) + return; + + chassert(!read_to_internal_buffer); + + if (prefetch_future.valid()) + return; + + if (!hasPendingDataToRead()) + return; + + size_t max_size_to_read = read_until_position ? 
read_until_position - offset : buf_size; + + prefetch_future = nexus_fs.prefetchToBuffer(file_name, offset, max_size_to_read, source_read_buffer); + + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchRequests); +} + +size_t ReadBufferFromNexusFS::readBigAt(char * to, size_t n, size_t range_begin, const std::function & progress_callback) +{ + if (n == 0) + return 0; + + size_t bytes_read = nexus_fs.read(file_name, range_begin, n, source_read_buffer, to); + + if (bytes_read && progress_callback) + progress_callback(bytes_read); + return bytes_read; +} + +void ReadBufferFromNexusFS::setReadUntilPosition(size_t position) +{ + if (position != static_cast(read_until_position)) + { + offset = getPosition(); + resetWorkingBuffer(); + resetPrefetch(); + read_until_position = position; + } +} + +void ReadBufferFromNexusFS::setReadUntilEnd() +{ + if (read_until_position) + { + offset = getPosition(); + resetWorkingBuffer(); + read_until_position = 0; + } +} + +void ReadBufferFromNexusFS::resetPrefetch() +{ + if (!prefetch_future.valid()) + return; + + auto bwh = prefetch_future.get(); + prefetch_future = {}; + + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSPrefetchedBytes, bwh.getSize()); + ProfileEvents::increment(ProfileEvents::ReadFromNexusFSUnusedPrefetches); +} + +} diff --git a/src/IO/ReadBufferFromFileWithNexusFS.h b/src/IO/ReadBufferFromNexusFS.h similarity index 66% rename from src/IO/ReadBufferFromFileWithNexusFS.h rename to src/IO/ReadBufferFromNexusFS.h index 2c50b1c965..18d3e5e20b 100644 --- a/src/IO/ReadBufferFromFileWithNexusFS.h +++ b/src/IO/ReadBufferFromNexusFS.h @@ -4,25 +4,30 @@ #include "IO/SeekableReadBuffer.h" #include +#include +#include namespace DB { -class ReadBufferFromFileWithNexusFS : public ReadBufferFromFileBase +class ReadBufferFromNexusFS : public ReadBufferFromFileBase { public: - explicit ReadBufferFromFileWithNexusFS( + explicit ReadBufferFromNexusFS( size_t buf_size, + bool actively_prefetch, std::unique_ptr 
source_read_buffer, NexusFS &nexus_fs); - ~ReadBufferFromFileWithNexusFS() override = default; + ~ReadBufferFromNexusFS() override; bool nextImpl() override; off_t seek(off_t off, int whence) override; + void prefetch(Priority priority) override; + IAsynchronousReader::Result readInto(char * data, size_t size, size_t offset, size_t ignore) override; size_t readBigAt(char * to, size_t n, size_t range_begin, const std::function & progress_callback) override; @@ -38,14 +43,26 @@ class ReadBufferFromFileWithNexusFS : public ReadBufferFromFileBase bool isSeekCheap() override { return false; } private: - LoggerPtr log = getLogger("ReadBufferFromFileWithNexusFS"); + + bool hasPendingDataToRead(); + + void resetPrefetch(); + + LoggerPtr log = getLogger("ReadBufferFromNexusFS"); const String file_name; std::unique_ptr source_read_buffer; NexusFS &nexus_fs; + const size_t buf_size = 0; off_t offset = 0; off_t read_until_position = 0; + + bool read_to_internal_buffer = false; + NexusFSBufferWithHandle nexusfs_buffer; + + const bool actively_prefetch = false; + std::future prefetch_future; }; } diff --git a/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp b/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp index 085ffb866b..29324ab2dd 100644 --- a/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp +++ b/src/IO/tests/gtest_read_buffer_with_nexus_fs.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -32,6 +32,8 @@ class ReadIndirectBuffer final : public ReadBufferFromFileBase off_t getPosition() override { return pos - working_buffer.begin(); } + size_t getFileSize() override { return working_buffer.size(); } + off_t seek(off_t off, int whence) override { impl.swap(*this); @@ -58,15 +60,24 @@ class ReadIndirectBuffer final : public ReadBufferFromFileBase const String path; }; -TEST(ReadBufferFromFileWithNexusFSTest, Read) +TEST(ReadBufferFromNexusFSTest, Read) { + const UInt32 segment_size = 128; AutoPtr conf(new MapConfiguration()); 
conf->setBool("nexus_fs.use_memory_device", true); - conf->setUInt64("nexus_fs.cache_size", 64 * MiB); + conf->setUInt64("nexus_fs.cache_size", 512 * 10); conf->setUInt64("nexus_fs.region_size", 512); - conf->setUInt64("nexus_fs.segment_size", 128); + conf->setUInt64("nexus_fs.segment_size", segment_size); conf->setUInt("nexus_fs.alloc_align_size", 32); conf->setUInt("nexus_fs.io_align_size", 32); + conf->setUInt("nexus_fs.clean_regions_pool", 3); + conf->setUInt("nexus_fs.clean_region_threads", 2); + conf->setUInt("nexus_fs.num_in_mem_buffers", 6); + conf->setBool("nexus_fs.enable_memory_buffer", true); + conf->setUInt("nexus_fs.reader_threads", 8); + conf->setUInt64("nexus_fs.memory_buffer_size", 128 * 6); + conf->setDouble("nexus_fs.memory_buffer_cooling_percent", 0.4); + conf->setDouble("nexus_fs.memory_buffer_freed_percent", 0.2); NexusFSConfig nexusfs_conf; nexusfs_conf.loadFromConfig(*conf); @@ -79,38 +90,47 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) String large_data; for (int i = 0; i < large_len; i++) large_data.push_back(i % 26 + 'a'); + for (int i = 0; i < large_len; i += segment_size) + { + auto s = fmt::format("seg#{}", i / segment_size); + for (int j = 0; j < s.size(); j++) + { + if (i + j < large_len) + large_data[i + j] = s[j]; + } + } // small read { auto source = std::make_unique("file1", small_data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs); + ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs); char buffer[small_len + 5]; memset(buffer, 0, small_len + 5); auto bytes_read = read_buffer.readBig(buffer, small_len); ASSERT_EQ(bytes_read, small_len); - ASSERT_TRUE(strcmp(buffer, small_data.c_str()) == 0); + EXPECT_STREQ(buffer, small_data.c_str()); } // large read { auto source = std::make_unique("file2", large_data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs); + ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(source), 
*nexus_fs); char buffer[large_len + 5]; memset(buffer, 0, large_len + 5); auto bytes_read = read_buffer.readBig(buffer, large_len); ASSERT_EQ(bytes_read, large_len); - ASSERT_TRUE(strcmp(buffer, large_data.c_str()) == 0); + EXPECT_STREQ(buffer, large_data.c_str()); } // with seek { constexpr int off = 200; auto source = std::make_unique("file3", large_data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs); + ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(source), *nexus_fs); char buffer[large_len - off + 5]; memset(buffer, 0, large_len - off + 5); @@ -118,7 +138,21 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) auto bytes_read = read_buffer.readBig(buffer, large_len - off); ASSERT_EQ(bytes_read, large_len - off); - ASSERT_TRUE(strcmp(buffer, large_data.substr(off).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(off).c_str()); + } + + // read nexus_fs disk cache + { + String data; + auto fake_source = std::make_unique("file2", data); + ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(fake_source), *nexus_fs); + + char buffer[large_len + 5]; + memset(buffer, 0, large_len + 5); + auto bytes_read = read_buffer.readBig(buffer, large_len); + + ASSERT_EQ(bytes_read, large_len); + EXPECT_STREQ(buffer, large_data.c_str()); } // multi thread @@ -128,14 +162,35 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) for (int i = 0; i < n; i++) threads[i] = std::thread([&](){ auto source = std::make_unique("file4", large_data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs); + ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs); + + char buffer[large_len + 5]; + memset(buffer, 0, large_len + 5); + auto bytes_read = read_buffer.readBig(buffer, large_len); + + ASSERT_EQ(bytes_read, large_len); + EXPECT_STREQ(buffer, large_data.c_str()); + }); + + for (int i = 0; i < n; i++) + threads[i].join(); + } + + // multi thread, non-aligned buffer size + { + 
constexpr int n = 20; + std::vector threads(n); + for (int i = 0; i < n; i++) + threads[i] = std::thread([&](){ + auto source = std::make_unique("file5", large_data); + ReadBufferFromNexusFS read_buffer(93, true, std::move(source), *nexus_fs); char buffer[large_len + 5]; memset(buffer, 0, large_len + 5); auto bytes_read = read_buffer.readBig(buffer, large_len); ASSERT_EQ(bytes_read, large_len); - ASSERT_TRUE(strcmp(buffer, large_data.c_str()) == 0); + EXPECT_STREQ(buffer, large_data.c_str()); }); for (int i = 0; i < n; i++) @@ -150,8 +205,8 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) std::vector threads(n); for (int i = 0; i < n; i++) threads[i] = std::thread([&](){ - auto source = std::make_unique("file5", large_data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(source), *nexus_fs); + auto source = std::make_unique("file6", large_data); + ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(source), *nexus_fs); std::default_random_engine local_generator; local_generator.seed(i); @@ -165,7 +220,7 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) auto bytes_read = read_buffer.read(buffer, local_buffer_size); ASSERT_EQ(bytes_read, local_buffer_size); - ASSERT_TRUE(strcmp(buffer, large_data.substr(offset, local_buffer_size).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(offset, local_buffer_size).c_str()); } }); @@ -173,27 +228,12 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) threads[i].join(); } - // read nexus_fs disk cache - { - String data; - auto fake_source = std::make_unique("file2", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); - - char buffer[large_len + 5]; - memset(buffer, 0, large_len + 5); - auto bytes_read = read_buffer.readBig(buffer, large_len); - - ASSERT_EQ(bytes_read, large_len); - ASSERT_TRUE(strcmp(buffer, large_data.c_str()) == 0); - } - // read until pos { constexpr int until_pos = 678; constexpr int offset = 123; - String data; - auto fake_source = 
std::make_unique("file2", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); + auto source = std::make_unique("file2", large_data); + ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs); char buffer[until_pos - offset + 5]; memset(buffer, 0, until_pos - offset + 5); @@ -202,15 +242,14 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) auto bytes_read = read_buffer.read(buffer, large_len); ASSERT_EQ(bytes_read, until_pos - offset); - ASSERT_TRUE(strcmp(buffer, large_data.substr(offset, bytes_read).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(offset, bytes_read).c_str()); } // read until end { - constexpr int offset = 256; - String data; - auto fake_source = std::make_unique("file3", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); + constexpr int offset = 200; + auto source = std::make_unique("file5", large_data); + ReadBufferFromNexusFS read_buffer(segment_size, true, std::move(source), *nexus_fs); char buffer[large_len - offset + 5]; memset(buffer, 0, large_len - offset + 5); @@ -219,15 +258,15 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) auto bytes_read = read_buffer.read(buffer, large_len); ASSERT_EQ(bytes_read, large_len - offset); - ASSERT_TRUE(strcmp(buffer, large_data.substr(offset).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(offset).c_str()); } // readInto { - constexpr int off = 200; + constexpr int off = 256; String data; - auto fake_source = std::make_unique("file2", data); - ReadBufferFromFileWithNexusFS read_buffer(128, std::move(fake_source), *nexus_fs); + auto fake_source = std::make_unique("file5", data); + ReadBufferFromNexusFS read_buffer(segment_size, false, std::move(fake_source), *nexus_fs); char buffer[large_len - off + 5]; memset(buffer, 0, large_len - off + 5); @@ -243,7 +282,7 @@ TEST(ReadBufferFromFileWithNexusFSTest, Read) } ASSERT_EQ(bytes_read, large_len - off); - ASSERT_TRUE(strcmp(buffer, 
large_data.substr(off).c_str()) == 0); + EXPECT_STREQ(buffer, large_data.substr(off).c_str()); } conf.reset(); diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 90963a6ffd..00683ce470 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -723,6 +724,15 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti } } + { + if (auto nexus_fs = getContext()->getNexusFS()) + { + new_values["NexusFSNumSegments"] = nexus_fs->getNumSegments(); + new_values["NexusFSNumFiles"] = nexus_fs->getNumFileMetas(); + new_values["NexusFSNumInodes"] = nexus_fs->getNumInodes(); + } + } + if (auto gin_store_reader_factory = getContext()->getGINStoreReaderFactory()) { new_values["GINReaderFactoryCacheBytes"] = gin_store_reader_factory->residentMemory(); diff --git a/src/Protos/disk_cache.proto b/src/Protos/disk_cache.proto index 1739a07a37..254ebc2732 100644 --- a/src/Protos/disk_cache.proto +++ b/src/Protos/disk_cache.proto @@ -70,6 +70,33 @@ message NexusFSIndexBucket { repeated NexusFSIndexEntry entries = 2; } +message NexusFSFileSegment { + required uint64 segment_id = 1; + required uint32 address_rid = 2; + required uint32 address_offset = 3; + required uint32 size = 4; +} + +message NexusFSFileMeta +{ + required string file_name = 1; + required uint64 file_size = 2; + repeated NexusFSFileSegment segments = 3; +} + +message NexusFSInode { + required string node_key = 1; + required uint64 node_id = 2; + repeated NexusFSFileMeta files = 3; +} + +message NexusFSInodeManager { + required string prefix = 1; + required string surfix = 2; + required NexusFSInode root_inode = 3; + repeated NexusFSInode inodes = 4; +} + message NexusFSConfig { required uint64 version = 1; required uint64 cache_size = 2; @@ -77,8 +104,6 @@ message NexusFSConfig { required uint32 alloc_align_size = 4; 
required uint32 region_size = 5; required uint32 segment_size = 6; - optional uint64 hole_count = 7; - optional uint64 hole_size_total = 8; - optional bool reinsertion_policy_enabled = 9; - optional uint64 used_size_bytes = 10; + required string file_prefix = 7; + required string file_surfix = 8; } diff --git a/src/Storages/DiskCache/Device.cpp b/src/Storages/DiskCache/Device.cpp index 8aa6b9f705..09341cbb69 100644 --- a/src/Storages/DiskCache/Device.cpp +++ b/src/Storages/DiskCache/Device.cpp @@ -463,7 +463,7 @@ namespace auto cur_time = getSteadyClock(); auto delay_ms = toMillis(cur_time - start_time).count(); if (delay_ms > static_cast(kIOTimeoutMs)) - LOG_ERROR(getLogger("Device"), + LOG_WARNING(getLogger("Device"), "[{}] IO timeout {}ms (submit +{}ms comp +{}ms): {}", parent.context.getName(), delay_ms, @@ -526,7 +526,7 @@ namespace delay_ms = toMillis(cur_time - comp_time).count(); if (delay_ms > static_cast(kIOTimeoutMs)) - LOG_ERROR(getLogger("Device"), + LOG_WARNING(getLogger("Device"), "[{}] IOReq timeout {}ms (comp +{}ms notify +{}ms): {}", context.getName(), delay_ms, diff --git a/src/Storages/DiskCache/Region.cpp b/src/Storages/DiskCache/Region.cpp index 1199f84d6e..bf7a708749 100644 --- a/src/Storages/DiskCache/Region.cpp +++ b/src/Storages/DiskCache/Region.cpp @@ -1,5 +1,6 @@ #include #include +#include #include @@ -182,4 +183,24 @@ void Region::readFromBuffer(UInt32 from_offset, size_t size, char *to) const memcpy(to, buffer->data() + from_offset, size); } +void Region::addHandle(std::shared_ptr &handle) +{ + std::lock_guard g{lock}; + handles.push_back(handle); +} + +void Region::resetHandles() +{ + std::lock_guard g{lock}; + for (auto &handle : handles) + handle->invalidRelAddress(); + handles.clear(); +} + +void Region::getHandles(std::vector> &handles_) +{ + std::lock_guard g{lock}; + handles_ = handles; +} + } diff --git a/src/Storages/DiskCache/Region.h b/src/Storages/DiskCache/Region.h index 1fe46aca9e..39108f6a1a 100644 --- 
a/src/Storages/DiskCache/Region.h +++ b/src/Storages/DiskCache/Region.h @@ -12,6 +12,11 @@ #include #include +namespace DB::NexusFSComponents +{ +class BlockHandle; +} + namespace DB::HybridCache { @@ -183,23 +188,9 @@ class Region // Returns the region id. RegionId id() const { return region_id; } - void addKey(UInt64 key) - { - std::lock_guard g{lock}; - keys.push_back(key); - } - - void resetKeys() - { - std::lock_guard g{lock}; - keys.clear(); - } - - void getKeys(std::vector &keys_) - { - std::lock_guard g{lock}; - keys_ = keys; - } + void addHandle(std::shared_ptr &handle); + void resetHandles(); + void getHandles(std::vector> &handles_); private: UInt32 activeOpenLocked() const; @@ -232,7 +223,7 @@ class Region UInt32 num_items{0}; std::unique_ptr buffer{nullptr}; - std::vector keys; + std::vector> handles; mutable TimedMutex lock{TimedMutex::Options(false)}; mutable ConditionVariable cond; diff --git a/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp b/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp index c762b66947..a1c0ec7806 100644 --- a/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromByteHDFS.cpp @@ -20,7 +20,6 @@ #include "Common/ProfileEvents.h" #include "Common/Stopwatch.h" -#include "Storages/HDFS/HDFSCommon.h" #include "Common/Exception.h" #include "common/sleep.h" @@ -116,7 +115,6 @@ static void doWithRetry(std::function func) struct ReadBufferFromByteHDFS::ReadBufferFromHDFSImpl { - HDFSConnectionParams hdfs_params; bool pread {false}; RemoteReadLog * remote_read_log; String remote_read_context; @@ -134,8 +132,7 @@ struct ReadBufferFromByteHDFS::ReadBufferFromHDFSImpl const HDFSConnectionParams & hdfs_params_, size_t read_until_position_, const ReadSettings & settings_) - : hdfs_params(hdfs_params_) - , pread(settings_.byte_hdfs_pread) + : pread(settings_.byte_hdfs_pread) , remote_read_log(settings_.remote_read_log) , remote_read_context(settings_.remote_read_context) , read_until_position(read_until_position_) @@ 
-263,14 +260,23 @@ ReadBufferFromByteHDFS::ReadBufferFromByteHDFS( off_t read_until_position_, std::optional file_size_) : ReadBufferFromFileBase(use_external_buffer_ ? 0 : read_settings.remote_fs_buffer_size, existing_memory_, alignment_, file_size_) + , hdfs_file_path(hdfs_file_path_) + , hdfs_params(hdfs_params_) + , read_until_position(read_until_position_) , settings(read_settings) - , impl(std::make_unique(hdfs_file_path_, hdfs_params_, read_until_position_, settings)) + , impl(nullptr) , total_network_throttler(settings.remote_throttler) { } ReadBufferFromByteHDFS::~ReadBufferFromByteHDFS() = default; +void ReadBufferFromByteHDFS::initImpl() +{ + chassert(!impl); + impl = std::make_unique(hdfs_file_path, hdfs_params, read_until_position, settings); +} + IAsynchronousReader::Result ReadBufferFromByteHDFS::readInto(char * data, size_t size, size_t read_offset, size_t ignore_bytes) { /** @@ -310,6 +316,8 @@ IAsynchronousReader::Result ReadBufferFromByteHDFS::readInto(char * data, size_t bool ReadBufferFromByteHDFS::nextImpl() { + if (!impl) + initImpl(); int bytes_read = impl->readImpl(internal_buffer.begin(), internal_buffer.size()); if (bytes_read) { @@ -339,6 +347,8 @@ off_t ReadBufferFromByteHDFS::seek(off_t offset_, int whence_) /// impl->getPosition() is the file position of the working buffer end /// Therefore working buffer corresponds to the file range /// [impl->getPosition() - working_buffer.size(), impl->getPosition()] + if (!impl) + initImpl(); if (!working_buffer.empty() && size_t(offset_) >= impl->getPosition() - working_buffer.size() && offset_ <= impl->getPosition()) @@ -356,6 +366,8 @@ off_t ReadBufferFromByteHDFS::seek(off_t offset_, int whence_) off_t ReadBufferFromByteHDFS::getPosition() { + if (!impl) + initImpl(); return impl->getPosition() - available(); } @@ -363,6 +375,8 @@ size_t ReadBufferFromByteHDFS::getFileSize() { if (file_size) return *file_size; + if (!impl) + initImpl(); file_size = impl->getFileSize(); return *file_size; @@ 
-370,20 +384,27 @@ size_t ReadBufferFromByteHDFS::getFileSize() String ReadBufferFromByteHDFS::getFileName() const { - return impl->hdfs_file_path; + return hdfs_file_path; } void ReadBufferFromByteHDFS::setReadUntilPosition(size_t position) { + if (!impl) + initImpl(); impl->setReadUntilPosition(position); } void ReadBufferFromByteHDFS::setReadUntilEnd() { + if (!impl) + initImpl(); impl->setReadUntilEnd(); } -size_t ReadBufferFromByteHDFS::getFileOffsetOfBufferEnd() const { +size_t ReadBufferFromByteHDFS::getFileOffsetOfBufferEnd() const +{ + if (!impl) + return 0; // file_offset=0 at the construction of ReadBufferFromHDFSImpl return impl->file_offset; } @@ -393,7 +414,7 @@ size_t ReadBufferFromByteHDFS::readBigAt(char * to, size_t n, size_t range_begin return 0; auto pooled_impl = impl_pool.get([this] (){ - return new ReadBufferFromHDFSImpl(impl->hdfs_file_path, impl->hdfs_params, 0, settings); + return new ReadBufferFromHDFSImpl(hdfs_file_path, hdfs_params, 0, settings); }); pooled_impl->seek(range_begin); diff --git a/src/Storages/HDFS/ReadBufferFromByteHDFS.h b/src/Storages/HDFS/ReadBufferFromByteHDFS.h index 54e8bea022..cf87e4b181 100644 --- a/src/Storages/HDFS/ReadBufferFromByteHDFS.h +++ b/src/Storages/HDFS/ReadBufferFromByteHDFS.h @@ -17,6 +17,7 @@ #include "Common/ObjectPool.h" #include "Common/config.h" +#include "Storages/HDFS/HDFSCommon.h" #if USE_HDFS #include "Core/Defines.h" @@ -65,6 +66,12 @@ struct ReadBufferFromHDFSImpl; bool isSeekCheap() override { return true; } private: + + void initImpl(); + + const String hdfs_file_path; + const HDFSConnectionParams hdfs_params; + const off_t read_until_position; ReadSettings settings; std::unique_ptr impl; ThrottlerPtr total_network_throttler; diff --git a/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp b/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp index 0574da386b..58ea6e1430 100644 --- a/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp +++ b/src/Storages/MergeTree/MergeTreeCNCHDataDumper.cpp 
@@ -584,21 +584,35 @@ size_t MergeTreeCNCHDataDumper::writeProjectionPart( { auto & checksums_files = projection_part->checksums_ptr->files; reordered_checksums.reserve(checksums_files.size()); - for (const auto & col_name : projection_description.column_names) + auto add_to_reordered_checksums = [&](std::vector extensions) { - const auto & name = ISerialization::getFileNameForStream(col_name, {}); - for (const auto & extension : {".bin", ".mrk"}) + for (const auto & col_name : projection_description.column_names) { - if (auto it = checksums_files.find(name + extension); it != checksums_files.end() && !it->second.is_deleted) + const auto & name = ISerialization::getFileNameForStream(col_name, {}); + for (const auto & extension : extensions) { - reordered_checksums.push_back(&*it); - } - else - { - LOG_ERROR(log, "Fail to find column {} in projection {}", name + extension, projection_name); + if (auto it = checksums_files.find(name + extension); it != checksums_files.end() && !it->second.is_deleted) + { + reordered_checksums.push_back(&*it); + } + else + { + LOG_ERROR(log, "Fail to find column {} in projection {}", name + extension, projection_name); + } } } + }; + if (version == MERGE_TREE_CHCH_DATA_STORAGTE_CONCENTRATED_MARK_LAYOUT_VERSION) + { + add_to_reordered_checksums({".mrk"}); + add_to_reordered_checksums({".bin"}); + } + else + { + add_to_reordered_checksums({".bin", ".mrk"}); } + + for (auto & file : reordered_checksums) { file->second.file_offset = data_file_offset; diff --git a/src/Storages/NexusFS/HitsReinsertionPolicy.h b/src/Storages/NexusFS/HitsReinsertionPolicy.h deleted file mode 100644 index 80397f596c..0000000000 --- a/src/Storages/NexusFS/HitsReinsertionPolicy.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -namespace DB::NexusFSComponents -{ -class NexusFSHitsReinsertionPolicy : public HybridCache::BlockCacheReinsertionPolicy -{ -public: - explicit NexusFSHitsReinsertionPolicy(UInt8 
hits_threshold_, const NexusFSIndex & index_) : hits_threshold{hits_threshold_}, index{index_} { } - - bool shouldReinsert(StringRef key) override - { - const auto lr = index.peek(makeHashKey(HybridCache::BufferView{key.size, reinterpret_cast(key.data)}).keyHash()); - return lr.isFound() && lr.getCurrentHits() >= hits_threshold; - } - -private: - const UInt8 hits_threshold{}; - - const NexusFSIndex & index; -}; -} diff --git a/src/Storages/NexusFS/NexusFS.cpp b/src/Storages/NexusFS/NexusFS.cpp index c3c845add9..61a4b6742c 100644 --- a/src/Storages/NexusFS/NexusFS.cpp +++ b/src/Storages/NexusFS/NexusFS.cpp @@ -1,46 +1,65 @@ +#include + #include #include #include +#include #include #include #include -#include -#include #include -#include +#include #include -#include +#include #include #include -#include -#include +#include #include -#include "common/unit.h" #include #include +#include #include #include #include #include +#include +#include namespace ProfileEvents { -extern const Event NexusFSDiskCacheHit; -extern const Event NexusFSDiskCacheHitInflightInsert; -extern const Event NexusFSDiskCacheMiss; +extern const Event NexusFSHit; +extern const Event NexusFSHitInflightInsert; +extern const Event NexusFSMiss; +extern const Event NexusFSPreload; +extern const Event NexusFSDeepRetry; extern const Event NexusFSDiskCacheEvict; -extern const Event NexusFSDiskCachePreload; -extern const Event NexusFSDiskCacheLookupRetries; extern const Event NexusFSDiskCacheInsertRetries; extern const Event NexusFSDiskCacheError; extern const Event NexusFSDiskCacheBytesRead; extern const Event NexusFSDiskCacheBytesWrite; -extern const Event NexusFSMemoryBufferHit; -extern const Event NexusFSMemoryBufferMiss; -extern const Event NexusFSMemoryBufferError; -extern const Event NexusFSMemoryBufferBytesRead; +extern const Event NexusFSReadFromInsertCxt; +extern const Event NexusFSReadFromInsertCxtRetry; +extern const Event NexusFSReadFromInsertCxtDeepRetry; +extern const Event 
NexusFSReadFromInsertCxtBytesRead; +extern const Event NexusFSReadFromInsertCxtNonCopy; +extern const Event NexusFSReadFromInsertCxtNonCopyBytesRead; +extern const Event NexusFSReadFromDisk; +extern const Event NexusFSReadFromDiskRetry; +extern const Event NexusFSReadFromDiskDeepRetry; +extern const Event NexusFSReadFromDiskBytesRead; +extern const Event NexusFSReadFromBuffer; +extern const Event NexusFSReadFromBufferRetry; +extern const Event NexusFSReadFromBufferDeepRetry; +extern const Event NexusFSReadFromBufferBytesRead; +extern const Event NexusFSReadFromBufferNonCopy; +extern const Event NexusFSReadFromBufferNonCopyBytesRead; +extern const Event NexusFSReadFromSourceBytesRead; +extern const Event NexusFSReadFromSourceMicroseconds; +extern const Event NexusFSTimeout; +extern const Event NexusFSPrefetchToBuffer; +extern const Event NexusFSPrefetchToBufferBytesRead; } namespace DB::ErrorCodes @@ -49,6 +68,8 @@ extern const int INVALID_CONFIG_PARAMETER; extern const int NOT_IMPLEMENTED; extern const int TIMEOUT_EXCEEDED; extern const int CANNOT_OPEN_FILE; +extern const int CANNOT_FSTAT; +extern const int CANNOT_TRUNCATE_FILE; } namespace DB @@ -67,38 +88,33 @@ UInt64 alignUp(UInt64 num, UInt64 alignment) return alignDown(num + alignment - 1, alignment); } -File NexusFSConfig::openFile(const std::string & file_name, UInt64 size, bool truncate) +File NexusFSConfig::openFile(const std::string & file_name, UInt64 size, bool truncate, bool direct_io) { LOG_INFO(log, "create file: {} sie: {}, truncate: {}", file_name, size, truncate); if (file_name.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "file name is empty"); - // TODO: use DIRECT_IO int flags{O_RDWR | O_CREAT}; + if (direct_io) + flags |= O_DIRECT; File f = File(file_name.c_str(), flags); chassert(f.getFd() >= 0); struct stat file_stat; if (fstat(f.getFd(), &file_stat) < 0) - throw std::system_error(errno, std::system_category(), fmt::format("failed to get the file stat for file {}", file_name)); + 
throwFromErrno(fmt::format("failed to get the file stat for file {}", file_name), ErrorCodes::CANNOT_FSTAT); UInt64 cur_file_size = file_stat.st_size; if (truncate && cur_file_size < size) { if (::ftruncate(f.getFd(), size) < 0) - throw std::system_error( - errno, - std::system_category(), - fmt::format("ftruncate failed with requested size {}, current size {}", size, cur_file_size)); + throwFromErrno( + fmt::format("ftruncate failed with requested size {}, current size {}", size, cur_file_size), + ErrorCodes::CANNOT_TRUNCATE_FILE); - LOG_INFO( - log, - "cache file {} is ftruncated from {} bytes to {} bytes", - file_name, - cur_file_size, - size); + LOG_INFO(log, "cache file {} is ftruncated from {} bytes to {} bytes", file_name, cur_file_size, size); } return f; @@ -110,16 +126,24 @@ void NexusFSConfig::loadFromConfig(const Poco::Util::AbstractConfiguration & con cache_size = conf.getUInt64(config_name + ".cache_size", 10 * GiB); region_size = conf.getUInt64(config_name + ".region_size", 1 * MiB); segment_size = conf.getUInt64(config_name + ".segment_size", 128 * KiB); - alloc_align_size = conf.getUInt(config_name + ".alloc_align_size", 4096); + alloc_align_size = conf.getUInt(config_name + ".alloc_align_size", 512); io_align_size = conf.getUInt(config_name + ".io_align_size", 4096); stripe_size = conf.getUInt(config_name + ".stripe_size", 4096); + reader_threads = conf.getUInt(config_name + ".reader_threads", getNumberOfPhysicalCPUCores() >> 1); clean_regions_pool = conf.getUInt(config_name + ".clean_regions_pool", 4); clean_region_threads = conf.getUInt(config_name + ".clean_region_threads", 2); num_in_mem_buffers = conf.getUInt(config_name + ".num_in_mem_buffers", 8); - memory_cache_size = conf.getUInt64(config_name + ".memory_cache_size", 1 * GiB); + enable_memory_buffer = conf.getBool(config_name + ".enable_memory_buffer", false); + support_prefetch = conf.getBool(config_name + ".support_prefetch", true); + memory_buffer_size = conf.getUInt64(config_name + 
".memory_buffer_size", 10 * GiB); + memory_buffer_cooling_percent = conf.getDouble(config_name + ".memory_buffer_cooling_percent", 0.1); + memory_buffer_freed_percent = conf.getDouble(config_name + ".memory_buffer_freed_percent", 0.05); timeout_ms = conf.getUInt(config_name + ".timeout_ms", 10000); + filemeta_gc_interval_s = conf.getUInt(config_name + ".filemeta_gc_interval_s", 300); bool use_memory_device = conf.getBool(config_name + ".use_memory_device", false); bool enable_async_io = conf.getBool(config_name + ".enable_async_io", false); + file_prefix = conf.getString(config_name + ".file_prefix", ""); + file_surfix = conf.getString(config_name + ".file_surfix", ""); double metadata_percentage = conf.getDouble(config_name + ".metadata_percentage", 0.01); metadata_size = alignUp(static_cast(metadata_percentage * cache_size), region_size); @@ -147,24 +171,16 @@ void NexusFSConfig::loadFromConfig(const Poco::Util::AbstractConfiguration & con File f; try { - f = openFile(path, cache_size, true); + f = openFile(path, cache_size, true, false); } - catch (const std::exception & e) + catch (const ErrnoException & e) { - LOG_ERROR( - &Poco::Logger::get("NexusFSConfig"), "Exception in openFile {}, error: {} errno: {}", path, e.what(), errno); + LOG_ERROR(getLogger("NexusFSConfig"), "Exception in openFile {}, error: {} errno: {}", path, e.what(), e.getErrno()); throw; } file_vec.push_back(std::move(f)); } - device = createDirectIoFileDevice( - std::move(file_vec), - cache_size, - io_align_size, - stripe_size, - 0, - io_engine, - q_depth); + device = createDirectIoFileDevice(std::move(file_vec), cache_size, io_align_size, stripe_size, 0, io_engine, q_depth); } validate(); @@ -179,16 +195,17 @@ NexusFSConfig & NexusFSConfig::validate() if (cache_size <= 0) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "invalid size"); if (cache_size % region_size != 0) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, - fmt::format("Cache size is not aligned to region size! 
cache size: {} region size: {]}", cache_size, region_size)); + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + fmt::format("Cache size is not aligned to region size! cache size: {} region size: {}", cache_size, region_size)); if (getNumberRegions() < clean_regions_pool) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "not enough space on device"); if (num_in_mem_buffers == 0) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "there must be at least one in-mem buffers"); if (num_priorities == 0) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "allocator must have at least one priority"); - - reinsertion_config.validate(); + if (reader_threads == 0) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "reader threads must greater than 0"); return *this; } @@ -198,10 +215,12 @@ NexusFS::NexusFS(NexusFSConfig && config) : serialized_config{serializeConfig(config)} , device{std::move(config.device)} , alloc_align_size{config.alloc_align_size} - , region_size{config.region_size} , metadata_size(config.metadata_size) , segment_size(config.segment_size) , timeout_ms(config.timeout_ms) + , file_prefix(config.file_prefix) + , file_surfix(config.file_surfix) + , index(getFilePrefix(), getFileSurfix(), getSegmentSize()) // , num_priorities{config.num_priorities} // , check_expired{std::move(config.check_expired)} // , destructor_callback{std::move(config.destructor_callback)} @@ -221,14 +240,32 @@ NexusFS::NexusFS(NexusFSConfig && config) config.num_priorities, config.in_mem_buf_flush_retry_limit} , allocator{region_manager, config.num_priorities} - , reinsertion_policy{makeReinsertionPolicy(config.reinsertion_config)} - , enable_segment_cache(false) - // , segment_cache(config.memory_cache_size / segment_size) + , enable_buffer(config.enable_memory_buffer) + , support_prefetch(config.support_prefetch) + , buffer_manager( + config.enable_memory_buffer ? 
+ BufferManager::initInstance( + config.memory_buffer_size, + config.segment_size, + config.filemeta_gc_interval_s, + config.memory_buffer_cooling_percent, + config.memory_buffer_freed_percent, + region_manager, + index) + : nullptr) { + if (support_prefetch) + { + for (uint32_t i = 0; i < config.reader_threads; i++) + { + auto name = fmt::format("NexusFS_read_worker_{}", i); + reader_workers.emplace_back(std::make_unique(name)); + } + } LOG_TRACE(log, "NexusFS created"); } -void NexusFS::preload(const String &file, const OffsetAndSizeVector &offsets_and_sizes, std::unique_ptr &source) +void NexusFS::preload(const String & file, const OffsetAndSizeVector & offsets_and_sizes, std::unique_ptr & source) { std::unordered_set segment_ids; for (const auto & [offset, size] : offsets_and_sizes) @@ -240,166 +277,539 @@ void NexusFS::preload(const String &file, const OffsetAndSizeVector &offsets_and } LOG_TRACE(log, "preload {} segments from {}", segment_ids.size(), file); - ProfileEvents::increment(ProfileEvents::NexusFSDiskCachePreload, segment_ids.size()); + ProfileEvents::increment(ProfileEvents::NexusFSPreload, segment_ids.size()); for (auto id : segment_ids) { String segment_name = getSegmentName(file, id); - off_t offset_in_source = getOffsetInSourceFile(id); - HashedKey key(segment_name); - std::shared_ptr cxt; - load(key, offset_in_source, source, cxt); + open(segment_name, file, id, source); } } -NexusFSIndex::LookupResult NexusFS::load(const HybridCache::HashedKey &key, off_t offset_in_source, std::unique_ptr &source, std::shared_ptr &insert_cxt) +std::tuple, std::shared_ptr, UInt64> +NexusFS::open(const String & segment_name, const String & file, const UInt64 segment_id, std::unique_ptr & source) { - LOG_TRACE(log, "try find {}({}) from index", key.key(), key.keyHash()); + LOG_TRACE(log, "try to find {} from index", segment_name); - auto lr = index.lookup(key.keyHash()); - if (!lr.isFound()) + auto seq_number = region_manager.getSeqNumber(); + auto handle = 
index.lookup(file, segment_id); + std::shared_ptr insert_cxt = nullptr; + if (!handle) { - LOG_TRACE(log, "{}({}) not find, read from source and insert to cache", key.key(), key.keyHash()); + LOG_TRACE(log, "{} not found, check InFlightInserts", segment_name); bool is_newly_created; - insert_cxt = in_flight_inserts.getOrCreateContext(key.keyHash(), is_newly_created); + insert_cxt = in_flight_inserts.getOrCreateContext(segment_name, is_newly_created); if (is_newly_created) { - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheMiss); + // double check index + handle = index.lookup(file, segment_id); + if (handle) + { + in_flight_inserts.removeContext(segment_name); + insert_cxt.reset(); + LOG_TRACE(log, "{} already inserted to index", segment_name); + ProfileEvents::increment(ProfileEvents::NexusFSHit); + } + else + { + LOG_TRACE(log, "create InsertCxt for {}, read from source and insert to cache", segment_name); + ProfileEvents::increment(ProfileEvents::NexusFSMiss); + + try /* guards the source read as well as the insert: on any failure the in-flight context is unregistered instead of staying behind unready */ + { + { + std::lock_guard lock(insert_cxt->mutex); /* held while the buffer is filled; readers wait on cv until ready is set */ + ProfileEventTimeIncrement source_watch(ProfileEvents::NexusFSReadFromSourceMicroseconds); + + insert_cxt->buffer = Buffer(segment_size); + off_t offset_in_source = getOffsetInSourceFile(segment_id); + size_t bytes_read + = source->readBigAt(reinterpret_cast(insert_cxt->buffer.data()), segment_size, offset_in_source); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromSourceBytesRead, bytes_read); + LOG_TRACE(log, "read {} bytes from source, key={}, offset={}", bytes_read, segment_name, offset_in_source); + + insert_cxt->buffer.shrink(bytes_read); /* keep only the bytes actually read (short read at end of file) */ + insert_cxt->ready = true; + insert_cxt->cv.notify_all(); + } + + auto get_file_and_segment_size = [&source, this]() { return std::make_pair(source->getFileSize(), segment_size); }; + handle = insert(file, segment_id, insert_cxt->buffer.view(), get_file_and_segment_size); + } + catch (...) /* any exception type, not only DB::Exception, must release the context */ + { + in_flight_inserts.removeContext(segment_name); + throw; /* rethrow the original exception object; `throw e` would copy it as Exception and slice derived types */ + } + 
in_flight_inserts.removeContext(segment_name); + } + } + else + { + LOG_TRACE(log, "found InsertCxt for {}, wait and read from InsertCxt", segment_name); + ProfileEvents::increment(ProfileEvents::NexusFSHitInflightInsert); + ProfileEvents::increment(ProfileEvents::NexusFSHit); + } + } + else + { + LOG_TRACE(log, "{} found, {}", segment_name, handle->toString()); + ProfileEvents::increment(ProfileEvents::NexusFSHit); + } + return std::make_tuple(handle, insert_cxt, seq_number); +} - insert_cxt->buffer = Buffer(segment_size); - size_t bytes_read = source->readBigAt(reinterpret_cast(insert_cxt->buffer.data()), segment_size, offset_in_source); - LOG_TRACE(log, "read {} bytes from source", bytes_read); +std::pair +NexusFS::readFromInsertCxtInternal(std::shared_ptr & cxt, const off_t offset_in_segment, const size_t max_size, char * to) const /* copying variant: waits for the in-flight insert to become ready, then copies up to max_size bytes into `to` */ +{ + { + std::unique_lock lock(cxt->mutex); + auto timeout = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms / 3); /* monotonic deadline: a wall-clock adjustment must not stretch or shorten the wait */ + if (!cxt->cv.wait_until(lock, timeout, [&] { return cxt->ready; })) + { + // timeout, deep retry + return {OpResult::DEEP_RETRY, 0}; + } + } - insert_cxt->buffer.shrink(bytes_read); - insert_cxt->ready = true; + size_t size = getReadSizeInSegment(offset_in_segment, cxt->buffer.size(), max_size); + if (size == 0) + return {OpResult::SUCCESS, 0}; - lr = insert(key, insert_cxt->buffer.view()); - in_flight_inserts.removeContext(key.keyHash()); + memcpy(to, cxt->buffer.data() + offset_in_segment, size); - //TODO: insert into memory cache - } - else + return {OpResult::SUCCESS, size}; +} + +std::pair +NexusFS::readFromInsertCxtInternal(std::shared_ptr & cxt, const off_t offset_in_segment, const size_t max_size) const /* non-copy variant: same wait, but the result references the context buffer instead of copying it */ +{ + { + std::unique_lock lock(cxt->mutex); + auto timeout = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeout_ms / 3); /* monotonic deadline, see the copying variant */ + if (!cxt->cv.wait_until(lock, timeout, [&] { return cxt->ready; })) { - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheHitInflightInsert); 
+ // timeout, deep retry + return {OpResult::DEEP_RETRY, NexusFSBufferWithHandle()}; } } - else + + size_t size = getReadSizeInSegment(offset_in_segment, cxt->buffer.size(), max_size); + if (size == 0) + return {OpResult::SUCCESS, NexusFSBufferWithHandle()}; + + NexusFSBufferWithHandle bwh; + bwh.buffer + = std::make_unique>(size, reinterpret_cast(cxt->buffer.data() + offset_in_segment), 0); + bwh.buffer->buffer().resize(size); + bwh.insert_cxt = cxt; + + return {OpResult::SUCCESS, std::move(bwh)}; +} + +std::pair NexusFS::readFromBufferInternal( + std::shared_ptr & handle, const UInt64 seq_number, const off_t offset_in_segment, const size_t size, char * to) +{ + chassert(buffer_manager); + auto [op_result, buffer] = buffer_manager->pin(handle, seq_number); + if (op_result == OpResult::SUCCESS) { - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheHit); + LOG_TRACE( + log, + "{} pinned, going to copy {} bytes from buffer({})", + handle->toString(), + size, + reinterpret_cast(buffer + offset_in_segment)); + chassert(buffer); + memcpy(to, reinterpret_cast(buffer + offset_in_segment), size); + handle->unpin(); + return {OpResult::SUCCESS, size}; + } + return {op_result, 0}; +} + +std::pair NexusFS::readFromBufferInternal( + std::shared_ptr & handle, const UInt64 seq_number, const off_t offset_in_segment, const size_t size) +{ + chassert(buffer_manager); + auto [op_result, buffer] = buffer_manager->pin(handle, seq_number); + if (op_result == OpResult::SUCCESS) + { + LOG_TRACE( + log, + "{} pinned, return a buffer({}) with {} bytes", + handle->toString(), + reinterpret_cast(buffer + offset_in_segment), + size); + chassert(buffer); + NexusFSBufferWithHandle bwh; + bwh.handle = handle; + bwh.buffer = std::make_unique>(size, reinterpret_cast(buffer + offset_in_segment), 0); + bwh.buffer->buffer().resize(size); + return {OpResult::SUCCESS, std::move(bwh)}; + } + return {op_result, NexusFSBufferWithHandle()}; +} + +std::pair NexusFS::readFromDiskInternal( + 
std::shared_ptr & handle, const UInt64 seq_number, const off_t offset_in_segment, const size_t size, char * to) +{ + if (!handle->isRelAddressValid()) + return {OpResult::DEEP_RETRY, 0}; + + auto addr = handle->getRelAddress(); + chassert(addr.rid().valid()); + RegionDescriptor desc = region_manager.openForRead(addr.rid(), seq_number); + switch (desc.getStatus()) + { + case OpenStatus::Retry: + if (region_manager.getSeqNumber() != seq_number) + return {OpResult::DEEP_RETRY, 0}; + else + return {OpResult::RETRY, 0}; + case OpenStatus::Error: + return {OpResult::ERROR, 0}; + case OpenStatus::Ready: + addr = addr.add(offset_in_segment); + size_t bytes_read = readEntry(desc, addr, size, to); + LOG_TRACE(log, "read {} bytes from disk, addr=<{},{}>", bytes_read, addr.rid().index(), addr.offset()); + if (bytes_read > 0) + { + region_manager.touch(addr.rid()); + } + region_manager.close(std::move(desc)); + return {OpResult::SUCCESS, bytes_read}; } - return lr; + return {OpResult::ERROR, 0}; // this line should not be reached } -size_t NexusFS::read(const String &file, const off_t offset, const size_t max_size, std::unique_ptr &source, char *to) +std::pair NexusFS::readFromInsertCxt( + Stopwatch & watch, const String & segment_name, std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size, char * to) + const +{ + auto [op_result, bytes_read] = readFromInsertCxtInternal(cxt, offset_in_segment, max_size, to); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtRetry); + std::tie(op_result, bytes_read) = readFromInsertCxtInternal(cxt, offset_in_segment, max_size, to); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxt); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtBytesRead, bytes_read); + return {false, bytes_read}; + case OpResult::RETRY: + 
ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, 0}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromInsertCxt failed when reading {}, cxt={}", + segment_name, + reinterpret_cast(cxt.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtDeepRetry); + return {true, 0}; + } +} + +std::pair NexusFS::readFromInsertCxt( + Stopwatch & watch, const String & segment_name, std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size) const +{ + auto [op_result, bwh] = readFromInsertCxtInternal(cxt, offset_in_segment, max_size); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtRetry); + std::tie(op_result, bwh) = readFromInsertCxtInternal(cxt, offset_in_segment, max_size); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtNonCopy); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtNonCopyBytesRead, bwh.getSize()); + return {false, std::move(bwh)}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, NexusFSBufferWithHandle()}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromInsertCxt(non-copy) failed when reading {}, cxt={}", + segment_name, + reinterpret_cast(cxt.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromInsertCxtDeepRetry); + return {true, NexusFSBufferWithHandle()}; + } +} + +std::pair NexusFS::readFromBuffer( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size, + char * to) +{ + auto [op_result, bytes_read] = readFromBufferInternal(handle, seq_number, offset_in_segment, size, to); + while 
(op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferRetry); + std::tie(op_result, bytes_read) = readFromBufferInternal(handle, seq_number, offset_in_segment, size, to); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBuffer); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferBytesRead, bytes_read); + return {false, bytes_read}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, 0}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromBuffer failed when reading {}, handle={}", + segment_name, + reinterpret_cast(handle.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferDeepRetry); + return {true, 0}; + } +} + +std::pair NexusFS::readFromBuffer( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size) +{ + auto [op_result, bwh] = readFromBufferInternal(handle, seq_number, offset_in_segment, size); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferRetry); + std::tie(op_result, bwh) = readFromBufferInternal(handle, seq_number, offset_in_segment, size); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferNonCopy); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferNonCopyBytesRead, bwh.getSize()); + return {false, std::move(bwh)}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, NexusFSBufferWithHandle()}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "readFromBuffer(non-copy) failed when reading {}, handle={}", + 
segment_name, + reinterpret_cast(handle.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromBufferDeepRetry); + return {true, NexusFSBufferWithHandle()}; + } +} + +std::pair NexusFS::readFromDisk( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size, + char * to) +{ + auto [op_result, bytes_read] = readFromDiskInternal(handle, seq_number, offset_in_segment, size, to); + while (op_result == OpResult::RETRY && watch.elapsedMilliseconds() < timeout_ms) + { + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDiskRetry); + std::tie(op_result, bytes_read) = readFromDiskInternal(handle, seq_number, offset_in_segment, size, to); + } + switch (op_result) + { + case OpResult::SUCCESS: + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDisk); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDiskBytesRead, bytes_read); + return {false, bytes_read}; + case OpResult::RETRY: + ProfileEvents::increment(ProfileEvents::NexusFSTimeout); + return {true, 0}; + case OpResult::ERROR: + throw Exception( + ErrorCodes::CANNOT_OPEN_FILE, + "readFromDisk failed when reading {}, handle={}", + segment_name, + reinterpret_cast(handle.get())); + default: + chassert(op_result == OpResult::DEEP_RETRY); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromDiskDeepRetry); + return {true, 0}; + } +} + +size_t +NexusFS::read(const String & file, const off_t offset, const size_t max_size, std::unique_ptr & source, char * to) { UInt64 segment_id = getSegmentId(offset); String segment_name = getSegmentName(file, segment_id); - off_t offset_in_source = getOffsetInSourceFile(segment_id); off_t offset_in_segment = getOffsetInSegment(offset); - HashedKey key(segment_name); Stopwatch watch; UInt32 num_tries = 0; - for (; watch.elapsedMilliseconds() < timeout_ms; num_tries++) + while (watch.elapsedMilliseconds() < timeout_ms) { - 
if (num_tries > 0) - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheLookupRetries); + num_tries++; + if (num_tries > 1) + ProfileEvents::increment(ProfileEvents::NexusFSDeepRetry); - std::shared_ptr cxt; - const auto seq_number = region_manager.getSeqNumber(); - auto lr = load(key, offset_in_source, source, cxt); + auto [handle, cxt, seq_number] = open(segment_name, file, segment_id, source); - // read from InsertCxt if (cxt) { - Stopwatch watch_cxt; - while (!cxt->ready && watch_cxt.elapsedMilliseconds() < timeout_ms) - { - std::this_thread::yield(); - } - - if (!cxt->ready) - { - LOG_WARNING(log, "stop waiting for InsertCxt to get ready, because of timeout ({}ms)", timeout_ms); - break; - } - - size_t buffer_size = cxt->buffer.size(); - if (buffer_size == 0 || buffer_size <= static_cast(offset_in_segment)) - { - return 0; - } - size_t size = getReadSizeInSegment(offset_in_segment, buffer_size, max_size); - chassert(size > 0); - memcpy(to, cxt->buffer.data() + offset_in_segment, size); - - return size; + // read from InsertCxt + auto [should_retry, bytes_read] = readFromInsertCxt(watch, segment_name, cxt, offset_in_segment, max_size, to); + if (should_retry) + continue; + else + return bytes_read; } - chassert(lr.isFound()); - ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheHit); - - if (lr.getSize() == 0) + chassert(handle); + size_t size = getReadSizeInSegment(offset_in_segment, handle->getSize(), max_size); + if (size == 0) return 0; - // read from memory buffer or disk - if (enable_segment_cache) + if (enable_buffer) { - // // load into memory cache and read - // auto callback = [this, &lr]{ return std::make_shared(loadToMemoryCache(lr)); }; - // auto &handle = segment_cache.getOrSet(key.keyHash(), callback); - // auto view = handle.pinMemoryBuffer(); - // size_t size = getReadSizeInSegment(offset_in_segment, lr.getSize(), max_size); - // ProfileEvents::increment(ProfileEvents::NexusFSMemoryBufferBytesRead); - // memcpy(to, view.data() + 
offset_in_segment, size); - // handle.unpinMemoryBuffer(); - // return size; - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "memory buffer of NexusFS is not implemented"); + // read from memroy buffer + auto [should_retry, bytes_read] = readFromBuffer(watch, segment_name, handle, seq_number, offset_in_segment, size, to); + if (should_retry) + continue; + else + return bytes_read; } else { - // directly read disk file - ProfileEvents::increment(ProfileEvents::NexusFSMemoryBufferMiss); + // read from disk + auto [should_retry, bytes_read] = readFromDisk(watch, segment_name, handle, seq_number, offset_in_segment, size, to); + if (should_retry) + continue; + else + return bytes_read; + } + } - auto addr = lr.getAddress(); - RegionDescriptor desc = region_manager.openForRead(addr.rid(), seq_number); - if (desc.getStatus() == OpenStatus::Retry) - { - // retry, go back to the for loop + ProfileEventTimeIncrement source_watch(ProfileEvents::NexusFSReadFromSourceMicroseconds); + LOG_WARNING(log, "read tries for {} times and timeout ({}ms), read directly from source", num_tries, watch.elapsedMilliseconds()); + size_t bytes_read = source->readBigAt(to, max_size, offset); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromSourceBytesRead, bytes_read); + LOG_TRACE(log, "read {} bytes from source, key={}, offset={}", bytes_read, segment_name, offset); + return bytes_read; +} + +NexusFSBufferWithHandle +NexusFS::read(const String & file, const off_t offset, const size_t max_size, std::unique_ptr & source) +{ + UInt64 segment_id = getSegmentId(offset); + String segment_name = getSegmentName(file, segment_id); + off_t offset_in_segment = getOffsetInSegment(offset); + + Stopwatch watch; + UInt32 num_tries = 0; + while (watch.elapsedMilliseconds() < timeout_ms) + { + num_tries++; + if (num_tries > 1) + ProfileEvents::increment(ProfileEvents::NexusFSDeepRetry); + + auto [handle, cxt, seq_number] = open(segment_name, file, segment_id, source); + + if (cxt) + { + // read from 
InsertCxt + auto [should_retry, bwh] = readFromInsertCxt(watch, segment_name, cxt, offset_in_segment, max_size); + if (should_retry) continue; - } - if (desc.getStatus() != OpenStatus::Ready) + else + return std::move(bwh); + } - throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "fail to open region for read"); + chassert(handle); + size_t size = getReadSizeInSegment(offset_in_segment, handle->getSize(), max_size); + if (size == 0) + return NexusFSBufferWithHandle(); - addr = addr.add(offset_in_segment); - size_t size = getReadSizeInSegment(offset_in_segment, lr.getSize(), max_size); - size_t bytes_read = readEntry(desc, addr, size, to); - LOG_TRACE(log, "read {} bytes from {}({})", bytes_read, key.key(), key.keyHash()); - if (bytes_read > 0) + // read from memroy buffer + chassert(enable_buffer); + auto [should_retry, bwh] = readFromBuffer(watch, segment_name, handle, seq_number, offset_in_segment, size); + if (should_retry) + continue; + else + return std::move(bwh); + } + + LOG_WARNING(log, "read tries for {} times and timeout ({}ms), read directly from source", num_tries, watch.elapsedMilliseconds()); + NexusFSBufferWithHandle bwh; + bwh.buffer = std::make_unique>(max_size, nullptr, 0); + + ProfileEventTimeIncrement source_watch(ProfileEvents::NexusFSReadFromSourceMicroseconds); + size_t bytes_read = source->readBigAt(bwh.buffer->position(), max_size, offset); + bwh.buffer->buffer().resize(bytes_read); + ProfileEvents::increment(ProfileEvents::NexusFSReadFromSourceBytesRead, bytes_read); + LOG_TRACE(log, "read {} bytes from source, key={}, offset={}", bytes_read, segment_name, offset); + return bwh; +} + +std::future +NexusFS::prefetchToBuffer(const String & file, off_t offset, size_t max_size, std::unique_ptr & source) +{ + auto promise = std::make_shared>(); + + if (support_prefetch) + { + getReadWorker().addTaskRemote([&file, &source, this, offset, max_size, promise]() { + try { - region_manager.touch(addr.rid()); + auto bwh = read(file, offset, max_size, 
source); + ProfileEvents::increment(ProfileEvents::NexusFSPrefetchToBuffer); + ProfileEvents::increment(ProfileEvents::NexusFSPrefetchToBufferBytesRead, bwh.getSize()); + promise->set_value(std::move(bwh)); } - region_manager.close(std::move(desc)); - - return bytes_read; - } + catch (Exception & e) + { + promise->set_exception(std::make_exception_ptr(e)); + } + }); } + else + promise->set_exception( + std::make_exception_ptr(Exception(ErrorCodes::NOT_IMPLEMENTED, "support_prefetch = false, prefetchToBuffer is not supported"))); - LOG_WARNING(log, "read tries for {} times and timeout ({}ms), read directly from source", num_tries, timeout_ms); - size_t bytes_read = source->readBigAt(to, max_size, offset); - LOG_TRACE(log, "read {} bytes from source", bytes_read); - return bytes_read; + return promise->get_future(); } -NexusFSIndex::LookupResult NexusFS::insert(const HashedKey &key, BufferView buf_view) +std::shared_ptr NexusFS::insert( + const String & file, UInt64 segment_id, BufferView buf_view, std::function()> get_file_and_segment_size) { size_t size = buf_view.size(); UInt32 aligned_size = alignedSize(size); if (size == 0) - return index.insert(key.keyHash(), RelAddress(), 0); + { + auto handle = std::make_shared(RelAddress(), 0); + index.insert(file, segment_id, handle, get_file_and_segment_size); + return handle; + } chassert(size > 0); chassert(size <= segment_size); @@ -415,77 +825,79 @@ NexusFSIndex::LookupResult NexusFS::insert(const HashedKey &key, BufferView buf_ if (desc.getStatus() == OpenStatus::Error) { ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheError); - LOG_ERROR(log, "failed to insert {}({}), size={}", key.key(), key.keyHash(), slot_size); - throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "failed to insert {}({}), size={}", key.key(), key.keyHash(), slot_size); + LOG_ERROR(log, "failed to insert {}#{}, size={}", file, segment_id, slot_size); + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "failed to insert {}#{}, size={}", file, 
segment_id, slot_size); } - writeEntry(addr, slot_size, key, buf_view); - auto lr = index.insert(key.keyHash(), addr, size); + chassert(addr.offset() + slot_size <= region_manager.regionSize()); + chassert(slot_size % alloc_align_size == 0ULL); + + auto handle = std::make_shared(addr, size); + writeEntry(handle, buf_view); + index.insert(file, segment_id, handle, get_file_and_segment_size); + + LOG_TRACE( + log, + "create {} for {}#{}, write to disk addr=<{},{}>, size={}, slot_size={}", + handle->toString(), + file, + segment_id, + addr.rid().index(), + addr.offset(), + buf_view.size(), + slot_size); + allocator.close(std::move(desc)); - return lr; + return handle; } -void NexusFS::writeEntry(RelAddress addr, UInt32 slot_size, const HashedKey &key, BufferView value) +void NexusFS::writeEntry(std::shared_ptr & handle, HybridCache::BufferView value) { - chassert(addr.offset() + slot_size <= region_manager.regionSize()); - chassert(slot_size % alloc_align_size == 0ULL); - - LOG_TRACE(log, "writeEntry rid={}, off={}, key={}({}), size={} ", addr.rid().index(), addr.offset(), key.key(), key.keyHash(), slot_size); + auto addr = handle->getRelAddress(); + chassert(addr.offset() + value.size() <= region_manager.regionSize()); auto rid = addr.rid(); auto & region = region_manager.getRegion(rid); region.writeToBuffer(addr.offset(), value); - region.addKey(key.keyHash()); + region.addHandle(handle); + num_segments++; ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheBytesWrite, value.size()); } -size_t NexusFS::readEntry(const RegionDescriptor &rdesc, RelAddress addr, UInt32 size, char *to) +size_t NexusFS::readEntry(const RegionDescriptor & rdesc, RelAddress addr, UInt32 size, char * to) { chassert(addr.offset() + size <= region_manager.regionSize()); - LOG_TRACE(log, "readEntry rid={}, off={}, size={} ", addr.rid().index(), addr.offset(), size); + LOG_TRACE(log, "read from disk addr=<{},{}>, size={} ", addr.rid().index(), addr.offset(), size); 
ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheBytesRead, size); return region_manager.read(rdesc, addr, size, to); } -std::shared_ptr NexusFS::makeReinsertionPolicy(const BlockCacheReinsertionConfig & reinsertion_config) -{ - auto hits_threshold = reinsertion_config.getHitsThreshold(); - if (hits_threshold) - return std::make_shared(hits_threshold, index); - - auto pct_threshold = reinsertion_config.getPctThreshold(); - if (pct_threshold) - return std::make_shared(pct_threshold); - - return reinsertion_config.getCustomPolicy(); -} - -UInt32 NexusFS::onRegionReclaim(RegionId rid, BufferView buffer) +UInt32 NexusFS::onRegionReclaim(RegionId rid, BufferView /*buffer*/) { UInt32 eviction_count = 0; auto & region = region_manager.getRegion(rid); - std::vector keys; - region.getKeys(keys); - chassert(region.getNumItems() == keys.size()); + std::vector> handles; + region.getHandles(handles); + chassert(region.getNumItems() == handles.size()); - for (auto key : keys) + for (auto & handle : handles) { - auto lr = index.lookup(key); - if (!lr.isFound()) + if (!handle) { - LOG_ERROR(log, "reclaim a key {} in from region {}, but it does not exist in index", key, rid.index()); + LOG_ERROR(log, "reclaim a handle in from region {}, but it is null", rid.index()); continue; } - auto addr = lr.getAddress(); - auto size = lr.getSize(); - BufferView value{size, buffer.data() + addr.offset()}; + // auto addr = handle->getRelAddress(); + // chassert(addr.rid().valid()); + // auto size = handle->getSize(); + // BufferView value{size, buffer.data() + addr.offset()}; - const auto reinsertion_res = reinsertOrRemoveItem(key, value, size, addr); + const auto reinsertion_res = reinsertOrRemoveItem(handle); switch (reinsertion_res) { case ReinsertionRes::kEvicted: @@ -501,10 +913,9 @@ UInt32 NexusFS::onRegionReclaim(RegionId rid, BufferView buffer) // if (destructor_callback && reinsertion_res == ReinsertionRes::kEvicted) // destructor_callback(key, value, DestructorEvent::Recycled); 
- } - region.resetKeys(); + region.resetHandles(); chassert(region.getNumItems() >= eviction_count); return eviction_count; } @@ -513,53 +924,35 @@ void NexusFS::onRegionCleanup(RegionId rid, BufferView /*buffer*/) { UInt32 eviction_count = 0; auto & region = region_manager.getRegion(rid); - std::vector keys; - region.getKeys(keys); - chassert(region.getNumItems() == keys.size()); + std::vector> handles; + region.getHandles(handles); + chassert(region.getNumItems() == handles.size()); - for (auto key : keys) + for (auto & handle : handles) { - auto lr = index.lookup(key); - if (!lr.isFound()) + if (!handle) { - LOG_ERROR(log, "reclaim a key {} in from region {}, but it does not exist in index", key, rid.index()); + LOG_ERROR(log, "cleanup a handle in from region {}, but it is null", rid.index()); continue; } - auto addr = lr.getAddress(); + // auto addr = handle->getRelAddress(); + // chassert(addr.rid().valid()); - auto remove_res = removeItem(key, addr); - - if (remove_res) - eviction_count++; + removeItem(handle); + eviction_count++; // if (destructor_callback && remove_res) // destructor_callback(key, value, DestructorEvent::Recycled); } - region.resetKeys(); + region.resetHandles(); chassert(region.getNumItems() >= eviction_count); } -NexusFS::ReinsertionRes NexusFS::reinsertOrRemoveItem(UInt64 key, BufferView /*value*/, UInt32 /*entry_size*/, RelAddress addr) +NexusFS::ReinsertionRes NexusFS::reinsertOrRemoveItem(std::shared_ptr & handle) { - auto remove_item = [this, key, addr](bool /*expired*/) { - if (index.removeIfMatch(key, addr)) - { - // if (expired) - // ProfileEvents::increment(ProfileEvents::BlockCacheEvictionExpiredCount); - return ReinsertionRes::kEvicted; - } - return ReinsertionRes::kRemoved; - }; - - const auto lr = index.peek(key); - if (!lr.isFound() || lr.getAddress() != addr) - { - // ProfileEvents::increment(ProfileEvents::BlockCacheEvictionLookupMissCount); - return ReinsertionRes::kRemoved; - } - - return remove_item(true); + 
removeItem(handle); + return ReinsertionRes::kRemoved; } Protos::NexusFSConfig NexusFS::serializeConfig(const NexusFSConfig & config) @@ -571,63 +964,74 @@ Protos::NexusFSConfig NexusFS::serializeConfig(const NexusFSConfig & config) serialized_config.set_alloc_align_size(config.alloc_align_size); serialized_config.set_region_size(config.region_size); serialized_config.set_segment_size(config.segment_size); - + serialized_config.set_file_prefix(config.file_prefix); + serialized_config.set_file_surfix(config.file_surfix); + return serialized_config; } -bool NexusFS::removeItem(UInt64 key, RelAddress addr) +void NexusFS::removeItem(std::shared_ptr & handle) { - return index.removeIfMatch(key, addr); + num_segments--; + handle->invalidRelAddress(); + LOG_TRACE(log, "invalid {} because of removeItem", handle->toString()); } void NexusFS::persist() { - LOG_INFO(log, "Starting block cache persist"); + LOG_INFO(log, "Starting NexusFS persist"); auto stream = createMetadataOutputStream(*device, metadata_size); Protos::NexusFSConfig config = serialized_config; - config.set_alloc_align_size(alloc_align_size); - config.set_region_size(region_size); - config.set_segment_size(segment_size); - config.set_reinsertion_policy_enabled(reinsertion_policy != nullptr); google::protobuf::io::CodedOutputStream ostream(stream.get()); google::protobuf::util::SerializeDelimitedToCodedStream(config, &ostream); region_manager.persist(&ostream); index.persist(&ostream); - LOG_INFO(log, "Finished block cache persist"); + LOG_INFO(log, "Finished NexusFS persist"); } bool NexusFS::recover() { - LOG_INFO(log, "Starting block cache recovery"); + LOG_INFO(log, "Starting NexusFS recovery"); reset(); + bool recovered = false; try { auto stream = createMetadataInputStream(*device, metadata_size); + if (!stream) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Failed to createMetadataInputStream"); Protos::NexusFSConfig config; google::protobuf::io::CodedInputStream istream(stream.get()); 
google::protobuf::util::ParseDelimitedFromCodedStream(&config, &istream, nullptr); if (config.cache_size() != serialized_config.cache_size() || config.metadata_size() != serialized_config.metadata_size() || config.region_size() != serialized_config.region_size() || config.segment_size() != serialized_config.segment_size() - || config.version() != serialized_config.version() || config.alloc_align_size() != serialized_config.alloc_align_size()) + || config.version() != serialized_config.version() || config.alloc_align_size() != serialized_config.alloc_align_size() + || config.file_prefix() != serialized_config.file_prefix() || config.file_surfix() != serialized_config.file_surfix()) { LOG_ERROR(log, "Recovery config: {}", config.DebugString()); throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Recovery config does not match cache config"); } region_manager.recover(&istream); - index.recover(&istream); + index.recover(&istream, region_manager, num_segments); + + // successful recovery, invalid current metadata + auto output_stream = createMetadataOutputStream(*device, metadata_size); + if (!output_stream) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Failed to createMetadataOutputStream"); + recovered = output_stream->invalidate(); } catch (const std::exception & e) { LOG_ERROR(log, "Exception: {}", e.what()); - LOG_ERROR(log, "Failed to recover block cache. Resetting cache."); + LOG_ERROR(log, "Failed to recover NexusFS. Resetting cache."); reset(); return false; } - LOG_INFO(log, "Finished block cache recovery"); - return true; + if (recovered) + LOG_INFO(log, "Finished NexusFS recovery. 
Recover {} inodes, {} files, {} segments", index.getNumInodes(), index.getNumFileMetas(), num_segments); + return recovered; } @@ -669,33 +1073,4 @@ bool NexusFS::shutDown() return true; } -// std::shared_ptr NexusFS::loadToMemoryCache(const NexusFSIndex::LookupResult &lr) -// { -// RelAddress addr = lr.getAddress(); -// size_t size = lr.getSize(); -// RegionDescriptor desc(OpenStatus::Retry); -// while (desc.getStatus() == OpenStatus::Retry) -// { -// const auto seq_number = region_manager.getSeqNumber(); //TODO: why we need this? -// desc = region_manager.openForRead(addr.rid(), seq_number); -// } -// if (desc.getStatus() != OpenStatus::Ready) -// { -// // TODO: err codes -// throw Exception("fail to open region for read", ErrorCodes::BAD_ARGUMENTS); -// } -// chassert(desc.getStatus() == OpenStatus::Ready); - -// Buffer buffer(size); -// size_t bytes_read = readEntry(desc, addr, size, reinterpret_cast(buffer.data())); -// chassert(size == bytes_read); -// LOG_TRACE(log, "loadToMemoryCache, read {} bytes from addr=<{},{}>", bytes_read, addr.rid().index(), addr.offset()); -// region_manager.touch(addr.rid()); -// region_manager.close(std::move(desc)); - -// auto handle = lr.getHandler(); -// handle->loadedToMemory(buffer); -// return handle; -// } - } diff --git a/src/Storages/NexusFS/NexusFS.h b/src/Storages/NexusFS/NexusFS.h index 0cdcbbca17..dc427efc26 100644 --- a/src/Storages/NexusFS/NexusFS.h +++ b/src/Storages/NexusFS/NexusFS.h @@ -1,37 +1,42 @@ #pragma once #include +#include #include #include #include #include -#include +#include #include #include #include #include #include +#include #include #include +#include #include +#include #include #include #include #include -#include -#include -#include +#include +#include #include +#include +#include #include #include -#include -#include namespace DB { +class NexusFSBufferWithHandle; + class NexusFSConfig { public: @@ -50,7 +55,6 @@ class NexusFSConfig UInt64 cache_size{10 * GiB}; std::unique_ptr eviction_policy; 
- BlockCacheReinsertionConfig reinsertion_config{}; // Region size UInt64 region_size{1 * MiB}; @@ -60,6 +64,8 @@ class NexusFSConfig UInt32 io_align_size{1}; UInt32 stripe_size{4096}; + UInt32 reader_threads{4}; + UInt32 clean_regions_pool{2}; UInt32 clean_region_threads{2}; @@ -68,15 +74,25 @@ class NexusFSConfig UInt16 num_priorities{1}; - UInt64 memory_cache_size{1 * GiB}; + bool enable_memory_buffer{false}; + bool support_prefetch{true}; + UInt64 memory_buffer_size{1 * GiB}; + double memory_buffer_cooling_percent{0.1}; + double memory_buffer_freed_percent{0.05}; UInt32 timeout_ms{10000}; + UInt32 filemeta_gc_interval_s{300}; + + String file_prefix; + String file_surfix; // Calculate the total region number. UInt32 getNumberRegions() const { + chassert(0ul == metadata_size % region_size); chassert(0ul == cache_size % region_size); - return cache_size / region_size; + chassert(cache_size > metadata_size); + return (cache_size - metadata_size) / region_size; } void loadFromConfig(const Poco::Util::AbstractConfiguration & conf); @@ -85,7 +101,7 @@ class NexusFSConfig LoggerPtr log = getLogger("NexusFSConfig"); NexusFSConfig & validate(); - File openFile(const std::string & file_name, UInt64 size, bool truncate); + File openFile(const std::string & file_name, UInt64 size, bool truncate, bool direct_io); }; @@ -96,10 +112,18 @@ class NexusFS { off_t offset; size_t size; - OffsetAndSize(off_t offset_, size_t size_) : offset(offset_), size(size_) {} + OffsetAndSize(off_t offset_, size_t size_) : offset(offset_), size(size_) { } }; using OffsetAndSizeVector = std::vector; + struct InsertCxt + { + std::mutex mutex; + std::condition_variable cv; + HybridCache::Buffer buffer; + bool ready = false; + }; + explicit NexusFS(NexusFSConfig && config); NexusFS(const NexusFS &) = delete; NexusFS & operator=(const NexusFS &) = delete; @@ -108,11 +132,31 @@ class NexusFS UInt64 getSize() const { return region_manager.getSize(); } UInt64 getSegmentSize() const { return 
segment_size; } + bool supportNonCopyingRead() const { return enable_buffer; } + bool supportPrefetch() const { return support_prefetch; } + String getFilePrefix() const { return file_prefix; } + String getFileSurfix() const { return file_surfix; } + UInt64 getNumSegments() const { return num_segments.load(); } + UInt64 getNumInodes() const { return index.getNumInodes(); } + UInt64 getNumFileMetas() const { return index.getNumFileMetas(); } + std::vector getFileCachedStates() { return index.getFileCachedStates(); } + + HybridCache::FiberThread & getReadWorker() + { + return *(reader_workers[reader_task_counter.fetch_add(1, std::memory_order_relaxed) % reader_workers.size()]); + } + + // preload an array of segments to disk cache + void preload(const String & file, const OffsetAndSizeVector & offsets_and_sizes, std::unique_ptr & source); + // read from nexusfs + size_t read(const String & file, off_t offset, size_t max_size, std::unique_ptr & source, char * to); - void preload(const String &file, const OffsetAndSizeVector &offsets_and_sizes, std::unique_ptr &source); - size_t read(const String &file, off_t offset, size_t max_size, std::unique_ptr &source, char *to); + // read from nexusfs (non-copy) + NexusFSBufferWithHandle read(const String & file, off_t offset, size_t max_size, std::unique_ptr & source); + std::future + prefetchToBuffer(const String & file, off_t offset, size_t max_size, std::unique_ptr & source); void flush(); void drain(); @@ -122,24 +166,17 @@ class NexusFS bool recover(); private: - - struct InsertCxt - { - HybridCache::Buffer buffer; - bool ready = false; //TODO: use waiter - }; - class InFlightInserts { public: - std::shared_ptr getOrCreateContext(UInt32 key, bool &is_newly_created) + std::shared_ptr getOrCreateContext(const String & file_and_segment_id, bool & is_newly_created) { - auto shard = key % kShards; - auto &mutex = mutexs[shard]; - auto &map = maps[shard]; + auto shard = std::hash()(file_and_segment_id) % kShards; + auto & mutex 
= mutexs[shard]; + auto & map = maps[shard]; { std::lock_guard guard{mutex}; - auto it = map.find(key); + auto it = map.find(file_and_segment_id); if (it != map.end()) { is_newly_created = false; @@ -147,40 +184,40 @@ class NexusFS } is_newly_created = true; auto cxt = std::make_shared(); - map[key] = cxt; + map[file_and_segment_id] = cxt; return cxt; } } - std::shared_ptr getContext(UInt32 key) + std::shared_ptr getContext(const String & file_and_segment_id) { - auto shard = key % kShards; - auto &mutex = mutexs[shard]; - auto &map = maps[shard]; + auto shard = std::hash()(file_and_segment_id) % kShards; + auto & mutex = mutexs[shard]; + auto & map = maps[shard]; { std::lock_guard guard{mutex}; - auto it = map.find(key); + auto it = map.find(file_and_segment_id); if (it != map.end()) return it->second; return nullptr; } } - void removeContext(UInt32 key) + void removeContext(const String & file_and_segment_id) { - auto shard = key % kShards; - auto &mutex = mutexs[shard]; - auto &map = maps[shard]; + auto shard = std::hash()(file_and_segment_id) % kShards; + auto & mutex = mutexs[shard]; + auto & map = maps[shard]; { std::lock_guard guard{mutex}; - map.erase(key); + map.erase(file_and_segment_id); } } private: static constexpr UInt32 kShards = 8192; std::array mutexs; - std::array>, kShards> maps; + std::array>, kShards> maps; }; @@ -188,17 +225,74 @@ class NexusFS static constexpr UInt16 kDefaultItemPriority = 0; UInt64 getSegmentId(const off_t offset) const { return offset / segment_size; } - static String getSegmentName(const String file, const UInt64 segment_id) { return file + "#" + std::to_string(segment_id); } + static String getSegmentName(const String file, const UInt64 segment_id) { return file + "#" + std::to_string(segment_id); } off_t getOffsetInSourceFile(const UInt64 segment_id) const { return segment_id * segment_size; } off_t getOffsetInSegment(const off_t file_offset) const { return file_offset % segment_size; } - static size_t 
getReadSizeInSegment(const off_t offset_in_segemt, const size_t segment_size, const size_t buffer_size) { return std::min(buffer_size, segment_size - offset_in_segemt); } + static size_t getReadSizeInSegment(const off_t offset_in_segemt, const size_t segment_size, const size_t buffer_size) + { + return segment_size >= static_cast(offset_in_segemt) ? std::min(buffer_size, segment_size - offset_in_segemt) : 0; + } UInt32 alignedSize(UInt32 size) const { return roundup(size, alloc_align_size); } - NexusFSComponents::NexusFSIndex::LookupResult load(const HybridCache::HashedKey &key, off_t offset_in_source, std::unique_ptr &source, std::shared_ptr &insert_cxt); - - void writeEntry(HybridCache::RelAddress addr, UInt32 slot_size, const HybridCache::HashedKey &key, HybridCache::BufferView value); - - size_t readEntry(const HybridCache::RegionDescriptor &desc, HybridCache::RelAddress addr, UInt32 size, char *to); + std::tuple, std::shared_ptr, UInt64> open( + const String & segment_name, + const String & file, + UInt64 segment_id, + std::unique_ptr & source); + + std::pair + readFromInsertCxtInternal(std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size, char * to) const; + std::pair + readFromInsertCxtInternal(std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size) const; + std::pair readFromBufferInternal( + std::shared_ptr & handle, UInt64 seq_number, off_t offset_in_segment, size_t size, char * to); + std::pair readFromBufferInternal( + std::shared_ptr & handle, UInt64 seq_number, off_t offset_in_segment, size_t size); + std::pair readFromDiskInternal( + std::shared_ptr & handle, UInt64 seq_number, off_t offset_in_segment, size_t size, char * to); + + // read form insert_cxt + std::pair readFromInsertCxt( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & cxt, + off_t offset_in_segment, + size_t max_size, + char * to) const; + // read form insert_cxt (non-copy) + std::pair readFromInsertCxt( + Stopwatch & watch, const String & 
segment_name, std::shared_ptr & cxt, off_t offset_in_segment, size_t max_size) const; + // read from buffer + std::pair readFromBuffer( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size, + char * to); + // read from buffer (non-copy) + std::pair readFromBuffer( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size); + // read from disk + std::pair readFromDisk( + Stopwatch & watch, + const String & segment_name, + std::shared_ptr & handle, + UInt64 seq_number, + off_t offset_in_segment, + size_t size, + char * to); + + + void writeEntry(std::shared_ptr & handle, HybridCache::BufferView value); + + size_t readEntry(const HybridCache::RegionDescriptor & desc, HybridCache::RelAddress addr, UInt32 size, char * to); UInt32 onRegionReclaim(HybridCache::RegionId rid, HybridCache::BufferView buffer); @@ -212,15 +306,15 @@ class NexusFS kRemoved, kEvicted, }; - ReinsertionRes reinsertOrRemoveItem(UInt64 key, HybridCache::BufferView value, UInt32 entry_size, HybridCache::RelAddress addr); + ReinsertionRes reinsertOrRemoveItem(std::shared_ptr & handle); - bool removeItem(UInt64 key, HybridCache::RelAddress addr); + void removeItem(std::shared_ptr & handle); - std::shared_ptr makeReinsertionPolicy(const BlockCacheReinsertionConfig & reinsertion_config); - - NexusFSComponents::NexusFSIndex::LookupResult insert(const HybridCache::HashedKey &key, HybridCache::BufferView buf_view); - - // std::shared_ptr loadToMemoryCache(const NexusFSComponents::NexusFSIndex::LookupResult &lr); + std::shared_ptr insert( + const String & file, + UInt64 segment_id, + HybridCache::BufferView buf_view, + std::function()> get_file_and_segment_size); LoggerPtr log = getLogger("NexusFS"); @@ -236,20 +330,24 @@ class NexusFS const std::unique_ptr device; const UInt32 alloc_align_size{}; - const UInt64 region_size{}; const 
UInt64 metadata_size{}; const UInt32 segment_size{}; const UInt32 timeout_ms{}; - NexusFSComponents::NexusFSIndex index; + const String file_prefix; + const String file_surfix; + NexusFSComponents::InodeManager index; HybridCache::RegionManager region_manager; HybridCache::Allocator allocator; - std::shared_ptr reinsertion_policy; - InFlightInserts in_flight_inserts; - const bool enable_segment_cache; - // NexusFSComponents::SegmentCacheLRU segment_cache; + const bool enable_buffer; + const bool support_prefetch; + NexusFSComponents::BufferManager * buffer_manager; + std::vector> reader_workers; + mutable std::atomic reader_task_counter{0}; + + std::atomic num_segments{0}; }; } diff --git a/src/Storages/NexusFS/NexusFSBuffer.cpp b/src/Storages/NexusFS/NexusFSBuffer.cpp new file mode 100644 index 0000000000..2c29368d5b --- /dev/null +++ b/src/Storages/NexusFS/NexusFSBuffer.cpp @@ -0,0 +1,426 @@ +#include + +#include + +#include +#include +#include +#include +#include "Common/ProfileEvents.h" +#include +#include +#include +#include + + +namespace ProfileEvents +{ +extern const Event NexusFSBufferHit; +extern const Event NexusFSBufferMiss; +extern const Event NexusFSBufferPreload; +extern const Event NexusFSBufferPreloadRetry; +extern const Event NexusFSBufferEmptyCoolingQueue; +extern const Event NexusFSDiskCacheBytesRead; +} + +namespace DB::ErrorCodes +{ +extern const int CANNOT_OPEN_FILE; +} + +namespace DB::NexusFSComponents +{ + +void BufferState::loadAndPin(const std::unique_lock &, std::shared_ptr & handle_) +{ + chassert(state == State::COLD); + chassert(!handle); + state = State::HOT; + reader = 1; + handle = handle_; +} + +void BufferState::pin(const std::unique_lock &) +{ + chassert(handle); + state = State::HOT; + reader++; +} + +void BufferState::unpin(const std::unique_lock & l) +{ + if (state != BufferState::State::HOT || reader == 0 || !handle) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "try to unpin a {} whose has invalid {}", + handle ? 
handle->toStringSimple() : "BlockHandle(nullptr)", + toString(l)); + reader--; +} + +bool BufferState::markCooling(const std::unique_lock &) +{ + if (!handle) + return false; + if (state == State::HOT && reader == 0) + { + state = State::COOLING; + return true; + } + return false; +} + +bool BufferState::tryUnload(const std::unique_lock &) +{ + if (!handle) + return false; + if (state == State::COOLING) + { + if (reader != 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "try to unload a cooling {} whose reader={}", handle->toStringSimple(), reader); + state = State::COLD; + reader = 0; + handle->resetBufferSlot(); + handle.reset(); + return true; + } + return false; +} + +String BufferState::toString(const std::unique_lock &) +{ + return fmt::format( + "BufferState(state={}, reader={}, buffer={}, handle={})", + static_cast(state), + reader, + reinterpret_cast(buffer), + reinterpret_cast(handle.get())); +} + + +BufferState * BlockHandle::getBuffer(const std::unique_lock & lock) +{ + auto slot_id = buffer_slot.load(); + auto * buffer_manager = BufferManager::getInstance(); + chassert(slot_id != INVALID_SLOT_ID); + chassert(buffer_manager); + chassert(&buffer_manager->getMetaMutex(slot_id) == lock.mutex()); + return &buffer_manager->getMetaState(slot_id); +} + +BufferState * BlockHandle::getBufferStateAndLock(std::unique_lock & lock) +{ + auto slot_id = buffer_slot.load(); + while (true) + { + if (slot_id == INVALID_SLOT_ID) + return nullptr; + auto * buffer_manager = BufferManager::getInstance(); + chassert(buffer_manager); + lock = std::unique_lock{buffer_manager->getMetaMutex(slot_id)}; + auto recheck_slot_id = buffer_slot.load(); + if (recheck_slot_id == slot_id) + return &buffer_manager->getMetaState(slot_id); + else + { + lock.unlock(); + lock.release(); + slot_id = recheck_slot_id; + } + } +} + +bool BlockHandle::setBufferSlot(SlotId slot_id) +{ + auto expected = INVALID_SLOT_ID; + return buffer_slot.compare_exchange_strong(expected, slot_id); +} + + +void 
BlockHandle::unpin() +{ + std::unique_lock lock; + auto * state = getBufferStateAndLock(lock); + chassert(state); + state->unpin(lock); +} + +String BlockHandle::toString() +{ + std::unique_lock lock; + auto * state = getBufferStateAndLock(lock); + if (state) + { + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, state={}, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + state->toString(lock), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); + } + else + { + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, state=null, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); + } +} + +String BlockHandle::toString(const std::unique_lock & lock) +{ + auto * state = getBuffer(lock); + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, state={}, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + state->toString(lock), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); +} + +String BlockHandle::toStringSimple() const +{ + auto laddr = addr.load(); + return fmt::format( + "BlockHandle({}, buffer_slot={}, valid={}, addr=<{},{}>, size={})", + reinterpret_cast(this), + buffer_slot.load(), + laddr.rid().valid(), + laddr.rid().index(), + laddr.offset(), + size); +} + + +std::unique_ptr BufferManager::buffer_manager = nullptr; + +BufferManager * BufferManager::initInstance( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_) +{ + if (buffer_manager) + throw Exception(ErrorCodes::LOGICAL_ERROR, "BufferManager already initialized"); + buffer_manager = std::make_unique( + buffer_size_, segment_size_, filemate_gc_interval_, cooling_percentage, freed_percentage, region_manager_, inode_manager_); + return 
buffer_manager.get(); +} + +BufferManager * BufferManager::getInstance() +{ + return buffer_manager.get(); +} + +BufferManager::BufferManager( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_) + : buffer_size(buffer_size_) + , slot_size(buffer_size_ / segment_size_) + , cooling_size(slot_size * cooling_percentage) + , freed_size(slot_size * freed_percentage) + , segment_size(segment_size_) + , filemate_gc_interval(filemate_gc_interval_) + , region_manager(region_manager_) + , inode_manager(inode_manager_) + , base_data(reinterpret_cast(mmap(nullptr, buffer_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0))) + , meta_locks(slot_size) + , free_list(folly::MPMCQueue(slot_size)) +{ + chassert(buffer_size_ % segment_size_ == 0); + chassert(base_data); + chassert(base_data % 4096 == 0); + + meta_states.reserve(slot_size); + for (size_t i = 0; i < slot_size; i++) + meta_states.emplace_back(BufferState(calculateBuffer(i))); + + for (SlotId i = 0; i < slot_size; i++) + free_list.write(i); + + cooling_and_gc_thread = std::thread([this] { coolDownBlocksAndGC(); }); +} + +BufferManager::~BufferManager() +{ + stop_cooling_and_gc.store(true, std::memory_order_relaxed); + cooling_and_gc_thread.join(); + + chassert(base_data); + munmap(reinterpret_cast(base_data), buffer_size); +} + +std::pair BufferManager::pin(std::shared_ptr & handle, UInt64 seq_number) +{ + std::unique_lock lock; + auto * state = handle->getBufferStateAndLock(lock); + if (!state) + { + ProfileEvents::increment(ProfileEvents::NexusFSBufferMiss); + return loadAndPin(handle, seq_number); + } + + ProfileEvents::increment(ProfileEvents::NexusFSBufferHit); + state->pin(lock); + auto buffer = state->getBuffer(); + chassert(buffer != 0); + return {OpResult::SUCCESS, buffer}; +} + +std::pair BufferManager::alloc() +{ + SlotId id; + if 
(!free_list.read(id)) + { + ProfileEvents::increment(ProfileEvents::NexusFSBufferEmptyCoolingQueue); + std::this_thread::yield(); + return {OpResult::RETRY, 0}; + } + LOG_TRACE(log, "erase slot {} from free_list", id); + return {OpResult::SUCCESS, id}; +} + +void BufferManager::free(SlotId slot_id) +{ + free_list.write(slot_id); + LOG_TRACE(log, "insert slot {} to free_list", slot_id); +} + +std::pair BufferManager::loadAndPin(std::shared_ptr & handle, const UInt64 seq_number) +{ + if (!handle->isRelAddressValid()) + return {OpResult::DEEP_RETRY, 0}; + + auto [op_result, slot_id] = alloc(); + if (op_result != OpResult::SUCCESS) + return {op_result, 0}; + + std::unique_lock l(meta_locks[slot_id]); + if (!handle->setBufferSlot(slot_id)) + { + LOG_TRACE( + log, + "try to set slot {} to BlockHandle({}, slot={}), but failed", + slot_id, + reinterpret_cast(handle.get()), + handle->getBufferSlot()); + free(slot_id); + return {OpResult::DEEP_RETRY, 0}; + } + + RelAddress addr = handle->getRelAddress(); + size_t size = handle->getSize(); + chassert(addr.rid().valid()); + chassert(size > 0); + + auto desc = region_manager.openForRead(addr.rid(), seq_number); + if (desc.getStatus() != HybridCache::OpenStatus::Ready) + { + handle->resetBufferSlot(); + free(slot_id); + if (desc.getStatus() == HybridCache::OpenStatus::Retry) + return {OpResult::DEEP_RETRY, 0}; + if (desc.getStatus() == HybridCache::OpenStatus::Error) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "fail to open region for read"); + } + + auto & state = getMetaState(slot_id); + chassert(!state.getHandle()); + uintptr_t buffer = state.getBuffer(); + chassert(buffer); + size_t bytes_read = region_manager.read(desc, addr, size, reinterpret_cast(buffer)); + chassert(size == bytes_read); + LOG_TRACE( + log, + "read {} bytes from disk(rid={}, offset={}, size={}) to buffer {}(slot={})", + bytes_read, + addr.rid().index(), + addr.offset(), + size, + reinterpret_cast(buffer), + slot_id); + 
ProfileEvents::increment(ProfileEvents::NexusFSDiskCacheBytesRead, size); + + region_manager.touch(addr.rid()); + region_manager.close(std::move(desc)); + + state.loadAndPin(l, handle); + + LOG_TRACE(log, "{} loadAndPin, buffer {}(slot={})", handle->toString(l), reinterpret_cast(buffer), slot_id); + + return {OpResult::SUCCESS, buffer}; +} + +void BufferManager::coolDownBlocksAndGC() +{ + SlotId cooling_itr = 0; + Stopwatch watch; + while (!stop_cooling_and_gc.load(std::memory_order_relaxed)) + { + size_t current_freed = free_list.size(); + if (current_freed >= freed_size) + { + if (watch.elapsedSeconds() >= filemate_gc_interval) + { + watch.restart(); + inode_manager.cleanInvalidFiles(); + } + else + std::this_thread::yield(); + continue; + } + + while (cooling_queue.size() < cooling_size) + { + std::unique_lock l(meta_locks[cooling_itr]); + if (meta_states[cooling_itr].markCooling(l)) + { + auto handle = meta_states[cooling_itr].getHandle(); + chassert(handle); + LOG_TRACE(log, "{} on slot {} turns cooling", handle->toString(l), cooling_itr); + cooling_queue.push(cooling_itr); + } + cooling_itr = (cooling_itr + 1) % slot_size; + } + + while (current_freed < freed_size && !cooling_queue.empty()) + { + SlotId id = cooling_queue.front(); + cooling_queue.pop(); + + std::unique_lock l(meta_locks[id]); + auto handle = meta_states[id].getHandle(); + if (meta_states[id].tryUnload(l)) + { + LOG_TRACE( + log, + "BlockHandle({}) unloaded, {}(slot={}) retrived", + reinterpret_cast(handle.get()), + meta_states[id].toString(l), + id); + free(id); + current_freed++; + } + } + } +} + +} diff --git a/src/Storages/NexusFS/NexusFSBuffer.h b/src/Storages/NexusFS/NexusFSBuffer.h new file mode 100644 index 0000000000..b6c80a2654 --- /dev/null +++ b/src/Storages/NexusFS/NexusFSBuffer.h @@ -0,0 +1,186 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB::NexusFSComponents +{ + +using 
Mutex = folly::fibers::TimedMutex; +using SlotId = UInt32; +using HybridCache::RelAddress; +class InodeManager; +class BlockHandle; +class BufferManager; + +constexpr UInt32 INVALID_SLOT_ID = UINT32_MAX; + +enum class OpResult : UInt16 +{ + SUCCESS, + RETRY, + DEEP_RETRY, + ERROR +}; + +class BufferState +{ +public: + explicit BufferState(uintptr_t buffer_) : handle(nullptr), buffer(buffer_), reader(0), state(State::COLD) + { + chassert(buffer != 0); + } + + void loadAndPin(const std::unique_lock &, std::shared_ptr & handle); + + void pin(const std::unique_lock &l); + void unpin(const std::unique_lock &l); + + bool markCooling(const std::unique_lock &); + + bool tryUnload(const std::unique_lock &); + + std::shared_ptr getHandle() const { return handle; } + uintptr_t getBuffer() const { return buffer; } + UInt16 getReader() const { return reader; } + + String toString(const std::unique_lock &); + +private: + enum class State : UInt8 + { + HOT, + COOLING, + COLD + }; + + std::shared_ptr handle{nullptr}; + const uintptr_t buffer{0}; + UInt16 reader{0}; + State state{State::COLD}; +}; + + +class BlockHandle +{ +public: + explicit BlockHandle(RelAddress addr_, UInt32 size_) : addr(addr_), size(size_) { } + BlockHandle(const BlockHandle &) = delete; + BlockHandle & operator=(const BlockHandle &) = delete; + + UInt32 getSize() const { return size; } + RelAddress getRelAddress() const { return addr; } + bool isRelAddressValid() const { return addr.load().rid().valid(); } + void invalidRelAddress() { addr.store(RelAddress()); } + SlotId getBufferSlot() const { return buffer_slot.load(); } + void resetBufferSlot() { buffer_slot.store(INVALID_SLOT_ID); } + + BufferState * getBuffer(const std::unique_lock & lock); + BufferState * getBufferStateAndLock(std::unique_lock & lock); + + bool setBufferSlot(SlotId slot_id); + + void unpin(); + + String toString(); + String toString(const std::unique_lock &); + String toStringSimple() const; + +private: + friend class BufferManager; + 
+ std::atomic addr{RelAddress()}; + const UInt32 size{0}; + std::atomic buffer_slot{INVALID_SLOT_ID}; +}; + + +class BufferManager +{ +public: + static std::unique_ptr buffer_manager; + static BufferManager * initInstance( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_ + ); + static BufferManager * getInstance(); + + explicit BufferManager( + size_t buffer_size_, + UInt32 segment_size_, + UInt32 filemate_gc_interval_, + double cooling_percentage, + double freed_percentage, + HybridCache::RegionManager & region_manager_, + InodeManager & inode_manager_); + ~BufferManager(); + BufferManager(const BufferManager &) = delete; + BufferManager & operator=(const BufferManager &) = delete; + + std::pair pin(std::shared_ptr & handle, UInt64 seq_number); + +private: + friend class BlockHandle; + + Mutex & getMetaMutex(SlotId id) + { + chassert(id < slot_size); + return meta_locks[id]; + } + BufferState & getMetaState(SlotId id) + { + chassert(id < slot_size); + return meta_states[id]; + } + + uintptr_t calculateBuffer(SlotId slot_id) const { return base_data + static_cast(slot_id) * segment_size; } + + std::pair alloc(); + void free(SlotId slot_id); + + std::pair loadAndPin(std::shared_ptr & handle, UInt64 seq_number); + + void coolDownBlocksAndGC(); + + LoggerPtr log = getLogger("NexusFSBufferManager"); + + const size_t buffer_size{}; + const size_t slot_size{}; + const size_t cooling_size{}; + const size_t freed_size{}; + const UInt32 segment_size{}; + const UInt32 filemate_gc_interval{}; + + HybridCache::RegionManager & region_manager; + InodeManager & inode_manager; + + const uintptr_t base_data; + + // TODO: optimize with lock manager + std::vector meta_locks; + std::vector meta_states; + + std::queue cooling_queue; + folly::MPMCQueue free_list; + + std::atomic stop_cooling_and_gc{false}; + std::thread 
cooling_and_gc_thread; +}; + +} diff --git a/src/Storages/NexusFS/NexusFSBufferWithHandle.h b/src/Storages/NexusFS/NexusFSBufferWithHandle.h new file mode 100644 index 0000000000..5c84a28a55 --- /dev/null +++ b/src/Storages/NexusFS/NexusFSBufferWithHandle.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class NexusFSBufferWithHandle +{ +public: + NexusFSBufferWithHandle() = default; + NexusFSBufferWithHandle(NexusFSBufferWithHandle && other) noexcept + : handle(std::move(other.handle)), buffer(std::move(other.buffer)), insert_cxt(std::move(other.insert_cxt)) + { + } + NexusFSBufferWithHandle & operator=(NexusFSBufferWithHandle && other) noexcept + { + if (this == &other) + return *this; + + reset(); + swap(handle, other.handle); + swap(buffer, other.buffer); + swap(insert_cxt, other.insert_cxt); + return *this; + } + ~NexusFSBufferWithHandle() { reset(); } + + void reset() + { + if (handle) + { + handle->unpin(); + handle.reset(); + } + if (buffer) + buffer.reset(); + if (insert_cxt) + insert_cxt.reset(); + } + + size_t getSize() const { return buffer ? buffer->available() : 0; } + BufferBase::Position getData() { return buffer ? 
buffer->position() : nullptr; } + +private: + friend class NexusFS; + + std::shared_ptr handle{nullptr}; + std::unique_ptr> buffer{nullptr}; + std::shared_ptr insert_cxt{nullptr}; +}; +} diff --git a/src/Storages/NexusFS/NexusFSIndex.cpp b/src/Storages/NexusFS/NexusFSIndex.cpp deleted file mode 100644 index bf44a6cf1c..0000000000 --- a/src/Storages/NexusFS/NexusFSIndex.cpp +++ /dev/null @@ -1,208 +0,0 @@ -#include - -#include - -#include - -#include -#include -#include "Storages/DiskCache/Types.h" - -namespace DB::ErrorCodes -{ -extern const int INVALID_CONFIG_PARAMETER; -} - -namespace DB::NexusFSComponents -{ -namespace -{ - UInt8 safeInc(UInt8 val) - { - if (val < std::numeric_limits::max()) - return val + 1; - return val; - } -} - -void NexusFSIndex::setHits(UInt64 key, UInt8 current_hits, UInt8 total_hits) -{ - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - it.value().current_hits = current_hits; - it.value().total_hits = total_hits; - } -} - -NexusFSIndex::LookupResult NexusFSIndex::lookup(UInt64 key) -{ - LookupResult result; - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - result.found = true; - result.record = it->second; - it.value().total_hits = safeInc(result.record.total_hits); - it.value().current_hits = safeInc(result.record.current_hits); - } - return result; -} - -NexusFSIndex::LookupResult NexusFSIndex::peek(UInt64 key) const -{ - LookupResult result; - const auto & map = getMap(key); - auto lock = std::shared_lock{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - result.found = true; - result.record = it->second; - } - return result; -} - -NexusFSIndex::LookupResult NexusFSIndex::insert(UInt64 key, RelAddress address, UInt32 size) -{ - LookupResult result; - auto & map = getMap(key); - // auto handle = std::make_shared(); - // 
handle->loadedToDisk(address); - - auto guard = std::lock_guard{getMutex(key)}; - auto ret = map.try_emplace(subkey(key), address, size); - chassert(ret.second); - result.found = true; - result.record = ret.first->second; - - return result; -} - -// bool NexusFSIndex::replaceIfMatch(UInt64 key, RelAddress new_address, RelAddress old_address) -// { -// auto & map = getMap(key); -// auto guard = std::lock_guard{getMutex(key)}; - -// auto it = map.find(subkey(key)); -// if (it != map.end() && it->second.address == old_address) -// { -// it.value().address = new_address; -// it.value().current_hits = 0; -// return true; -// } -// return false; -// } - -void NexusFSIndex::trackRemove(UInt8 total_hits) -{ - if (total_hits == 0) - unaccessed_items++; -} - -NexusFSIndex::LookupResult NexusFSIndex::remove(UInt64 key) -{ - LookupResult result; - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end()) - { - result.found = true; - result.record = it->second; - - trackRemove(it->second.total_hits); - map.erase(it); - } - return result; -} - -bool NexusFSIndex::removeIfMatch(UInt64 key, RelAddress address) -{ - auto & map = getMap(key); - auto guard = std::lock_guard{getMutex(key)}; - - auto it = map.find(subkey(key)); - if (it != map.end() && it->second.address == address) - { - trackRemove(it->second.total_hits); - map.erase(it); - return true; - } - return false; -} - -void NexusFSIndex::reset() -{ - for (UInt32 i = 0; i < kNumBuckets; i++) - { - auto guard = std::lock_guard{getMutexOfBucket(i)}; - buckets[i].clear(); - } - unaccessed_items = 0; -} - -size_t NexusFSIndex::compuiteSize() const -{ - size_t size = 0; - for (UInt32 i = 0; i < kNumBuckets; i++) - { - auto guard = std::lock_guard{getMutexOfBucket(i)}; - size += buckets[i].size(); - } - return size; -} - -void NexusFSIndex::persist(google::protobuf::io::CodedOutputStream * stream) const -{ - Protos::NexusFSIndexBucket bucket; - for (UInt32 
i = 0; i < kNumBuckets; i++) - { - bucket.set_bucket_id(i); - for (const auto & [key, v] : buckets[i]) - { - auto * entry = bucket.add_entries(); - entry->set_key(key); - entry->set_address_rid(v.address.rid().index()); - entry->set_address_offset(v.address.offset()); - entry->set_size(v.size); - entry->set_total_hits(v.total_hits); - entry->set_current_hits(v.current_hits); - } - - google::protobuf::util::SerializeDelimitedToCodedStream(bucket, stream); - bucket.clear_entries(); - } -} - -void NexusFSIndex::recover(google::protobuf::io::CodedInputStream * stream) -{ - for (UInt32 i = 0; i < kNumBuckets; i++) - { - Protos::NexusFSIndexBucket bucket; - google::protobuf::util::ParseDelimitedFromCodedStream(&bucket, stream, nullptr); - UInt32 id = bucket.bucket_id(); - if (id >= kNumBuckets) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "Invalid bucket id. Max buckets: {}, bucket id: {}", kNumBuckets, id); - - for (const auto & entry : bucket.entries()) - { - buckets[id].try_emplace( - entry.key(), - RelAddress(HybridCache::RegionId(entry.address_rid()), entry.address_offset()), - static_cast(entry.size()), - static_cast(entry.total_hits()), - static_cast(entry.current_hits())); - } - } -} - -} diff --git a/src/Storages/NexusFS/NexusFSIndex.h b/src/Storages/NexusFS/NexusFSIndex.h deleted file mode 100644 index 7da3e197d1..0000000000 --- a/src/Storages/NexusFS/NexusFSIndex.h +++ /dev/null @@ -1,180 +0,0 @@ -#pragma once - -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include "IO/BufferWithOwnMemory.h" -#include "QueryPlan/Void.h" -#include "Storages/DiskCache/Buffer.h" -#include "Storages/DiskCache/Types.h" - - - -namespace ProfileEvents -{ - extern const Event NexusFSMemoryBufferHit; - extern const Event NexusFSMemoryBufferMiss; - extern const Event NexusFSMemoryBufferEvict; -} - -namespace DB::NexusFSComponents -{ -using HybridCache::RelAddress; - -class NexusFSIndex -{ -public: - using 
SharedMutex = folly::fibers::TimedRWMutexWritePriority; - - NexusFSIndex() = default; - NexusFSIndex(const NexusFSIndex &) = delete; - NexusFSIndex & operator=(const NexusFSIndex &) = delete; - - void persist(google::protobuf::io::CodedOutputStream * stream) const; - void recover(google::protobuf::io::CodedInputStream * stream); - - struct PACKED_LINLINE ItemRecord - { - // address in device - // std::shared_ptr handle; - RelAddress address{RelAddress()}; - // item size - UInt32 size{0}; - // total hits during this item's entire lifetime in cache - UInt8 total_hits{0}; - // hits during the current window for this item - UInt8 current_hits{0}; - - // explicit ItemRecord(std::shared_ptr handle_ = nullptr, UInt32 size_ = 0, UInt8 total_hits_ = 0, UInt8 current_hits_ = 0) - // : handle(handle_), size(size_), total_hits(total_hits_), current_hits(current_hits_) - // { - // } - explicit ItemRecord(RelAddress address_ = RelAddress(), UInt32 size_ = 0, UInt8 total_hits_ = 0, UInt8 current_hits_ = 0) - : address(address_), size(size_), total_hits(total_hits_), current_hits(current_hits_) - { - } - }; - // static_assert(14 == sizeof(ItemRecord), "ItemRecord size is 14 bytes"); - - struct LookupResult - { - friend class NexusFSIndex; - - bool isFound() const { return found; } - - ItemRecord getRecord() const - { - chassert(found); - return record; - } - - // std::shared_ptr getHandler() const - // { - // chassert(found); - // return record.handle; - // } - - RelAddress getAddress() const - { - chassert(found); - return record.address; - } - - UInt32 getSize() const - { - chassert(found); - return record.size; - } - - UInt8 getTotalHits() const - { - chassert(found); - return record.total_hits; - } - - UInt8 getCurrentHits() const - { - chassert(found); - return record.current_hits; - } - - private: - ItemRecord record; - bool found{false}; - }; - - // Gets value and update tracking counters - LookupResult lookup(UInt64 key); - - // Gets value without updating tracking 
counters - LookupResult peek(UInt64 key) const; - - // Overwrites existing key if exists with new address adn size. If the entry was successfully overwritten, - // LookupResult returns . - LookupResult insert(UInt64 key, RelAddress address, UInt32 size); - - // // Replaces old address with new address if there exists the key with the identical old address. - // bool replaceIfMatch(UInt64 key, RelAddress new_address, RelAddress old_address); - - // If the entry was successfully removed, LookupResult returns . - LookupResult remove(UInt64 key); - - // Removes only if both key and address match. - bool removeIfMatch(UInt64 key, RelAddress address); - - // Update hits information of a key. - void setHits(UInt64 key, UInt8 current_hits, UInt8 total_hits); - - // Resets all the buckets to the initial state. - void reset(); - - // Walks buckets and computes total index entry count - size_t compuiteSize() const; - -private: - static constexpr UInt32 kNumBuckets{64 * 1024}; - static constexpr UInt32 kNumMutexes{1024}; - - using Map = tsl::sparse_map; - - static UInt32 bucket(UInt64 hash) { return (hash >> 32) & (kNumBuckets - 1); } - - static UInt32 subkey(UInt64 hash) { return hash & 0xffffffffu; } - - SharedMutex & getMutexOfBucket(UInt32 bucket) const - { - chassert(isPowerOf2(kNumMutexes)); - return mutex[bucket & (kNumMutexes - 1)]; - } - - SharedMutex & getMutex(UInt64 hash) const - { - auto b = bucket(hash); - return getMutexOfBucket(b); - } - - Map & getMap(UInt64 hash) const - { - auto b = bucket(hash); - return buckets[b]; - } - - void trackRemove(UInt8 total_hits); - - mutable Poco::AtomicCounter unaccessed_items; - - std::unique_ptr mutex{new SharedMutex[kNumMutexes]}; - std::unique_ptr buckets{new Map[kNumBuckets]}; -}; - -} diff --git a/src/Storages/NexusFS/NexusFSInodeManager.cpp b/src/Storages/NexusFS/NexusFSInodeManager.cpp new file mode 100644 index 0000000000..6d8abd1ee1 --- /dev/null +++ b/src/Storages/NexusFS/NexusFSInodeManager.cpp @@ -0,0 +1,392 @@ 
+#include + +#include +#include + +#include + +#include +#include "common/defines.h" +#include "common/logger_useful.h" +#include "common/types.h" +#include +#include +#include "IO/WriteHelpers.h" + +namespace ProfileEvents +{ +extern const Event NexusFSInodeManagerLookupMicroseconds; +extern const Event NexusFSInodeManagerInsertMicroseconds; +} + +namespace DB::ErrorCodes +{ +extern const int INVALID_CONFIG_PARAMETER; +extern const int CANNOT_OPEN_FILE; +} + +namespace DB::NexusFSComponents +{ + +std::shared_ptr FileMeta::getHandle(UInt64 segment_id) +{ + std::lock_guard l(mutex); + if (segment_id >= segments.size()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta::getHandle for segment_id {} out of bound", segment_id); + if (segments[segment_id] && !segments[segment_id]->isRelAddressValid()) + segments[segment_id].reset(); + return segments[segment_id]; +} + +void FileMeta::setHandle(UInt64 segment_id, std::shared_ptr & handle) +{ + std::lock_guard l(mutex); + if (segment_id >= segments.size()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta::setHandle for segment_id {} out of bound", segment_id); + segments[segment_id] = handle; +} + +void FileMeta::toProto(Protos::NexusFSFileMeta * proto) +{ + std::lock_guard l(mutex); + proto->set_file_size(file_size); + for (size_t i = 0; i < segments.size(); i++) + { + const auto & handle = segments[i]; + if (handle) + { + Protos::NexusFSFileSegment * proto_handle = proto->add_segments(); + proto_handle->set_segment_id(i); + proto_handle->set_address_rid(handle->getRelAddress().rid().index()); + proto_handle->set_address_offset(handle->getRelAddress().offset()); + proto_handle->set_size(handle->getSize()); + } + } +} + +bool FileMeta::canBeRemoved() +{ + bool has_valid_handle = false; + std::lock_guard l(mutex); + for (auto & segment : segments) + { + if (segment) + { + if (segment->isRelAddressValid()) + has_valid_handle = true; + else + segment.reset(); + } + } + return has_valid_handle; +} + +std::pair 
FileMeta::getCachedSizeAndSegments() +{ + std::lock_guard l(mutex); + UInt64 cached_segments = 0; + UInt64 cached_size = 0; + for (auto & segment : segments) + { + if (segment) + { + if (segment->isRelAddressValid()) + { + cached_segments++; + cached_size += segment->getSize(); + } + else + segment.reset(); + } + } + return {cached_size, cached_segments}; +} + + +std::shared_ptr Inode::getHandle(String & file, UInt64 segment_id) +{ + auto it = files.find(file); + if (it == files.end()) + return nullptr; + + const auto & meta = it->second; + if (!meta) + return nullptr; + + return meta->getHandle(segment_id); +} + +void Inode::setHandle( + const String & file, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size, + std::atomic & num_file_metas) +{ + auto it = files.find(file); + if (it == files.end()) + { + auto [file_size, segment_size] = get_file_and_segment_size(); + auto meta = std::make_shared(file_size, segment_size); + it = files.try_emplace(file, meta).first; + num_file_metas++; + } + + const auto & meta = it->second; + if (!meta) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta for file {} not found", file); + + meta->setHandle(segment_id, handle); +} + +void Inode::setHandle(const String & file, std::shared_ptr & file_meta) +{ + if (!files.try_emplace(file, file_meta).second) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "FileMeta for file {} already exists", file); +} + +void Inode::cleanInvalidFiles(std::atomic & num_file_metas) +{ + for (auto [file_name, file_meta] : files) + { + if (file_meta->canBeRemoved()) + { + files.erase(file_name); + num_file_metas--; + } + } +} + +void Inode::toProto(Protos::NexusFSInode * node) +{ + node->set_node_id(id); + for (const auto & [key, meta] : files) + { + Protos::NexusFSFileMeta * file = node->add_files(); + file->set_file_name(key); + meta->toProto(file); + } +} + +void Inode::getFileCachedStates(std::vector & result) +{ + for (const auto & [file_name, 
file_meta] : files) + { + FileCachedState state; + auto [total_size, total_segment] = file_meta->getTotalSizeAndSegments(); + auto [cached_size, cached_segment] = file_meta->getTotalSizeAndSegments(); + result.emplace_back(FileCachedState{ + .file_path = file_name, + .total_segments = total_segment, + .cached_segments = cached_segment, + .total_size = total_size, + .cached_size = cached_size}); + } +} + + +std::shared_ptr InodeManager::lookup(const String & path, UInt64 segment_id) const +{ + // TODO: increase hits + return peek(path, segment_id); +} + +std::shared_ptr InodeManager::peek(const String & path, UInt64 segment_id) const +{ + ProfileEventTimeIncrement watch(ProfileEvents::NexusFSInodeManagerLookupMicroseconds); + + std::vector dirs; + resolvePath(path, dirs); + chassert(!dirs.empty()); + + String file = dirs.back(); + dirs.pop_back(); + + UInt64 pid = 0; + auto it = inodes.end(); + for (auto & dir : dirs) + { + String pid_dir = toString(pid) + "/" + dir; + it = inodes.find(pid_dir); + if (it == inodes.end()) + { + return nullptr; + } + else + { + pid = it->second->getId(); + } + } + + const auto & inode = dirs.empty() ? root_inode : it->second; + return inode->getHandle(file, segment_id); +} + +void InodeManager::insert( + const String & path, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size) +{ + ProfileEventTimeIncrement watch(ProfileEvents::NexusFSInodeManagerInsertMicroseconds); + + std::vector dirs; + resolvePath(path, dirs); + + String file = dirs.back(); + dirs.pop_back(); + + UInt64 pid = 0; + auto it = inodes.end(); + for (const auto & dir : dirs) + { + String pid_dir = toString(pid) + "/" + dir; + it = inodes.find(pid_dir); + if (it == inodes.end()) + { + it = inodes.try_emplace(pid_dir, std::make_shared(inode_id.fetch_add(1))).first; + num_inodes++; + } + pid = it->second->getId(); + } + + const auto & inode = dirs.empty() ? 
root_inode : it->second; + inode->setHandle(file, segment_id, handle, get_file_and_segment_size, num_file_metas); +} + +void InodeManager::reset() +{ + inodes.clear(); + num_inodes = 1; +} + +void InodeManager::persist(google::protobuf::io::CodedOutputStream * stream) const +{ + Protos::NexusFSInodeManager manager; + manager.set_prefix(prefix); + manager.set_surfix(surfix); + auto * root_inode_proto = manager.mutable_root_inode(); + root_inode_proto->set_node_key(""); + root_inode->toProto(root_inode_proto); + for (const auto & [key, val] : inodes) + { + auto * node = manager.add_inodes(); + node->set_node_key(key); + val->toProto(node); + } + google::protobuf::util::SerializeDelimitedToCodedStream(manager, stream); +} + +void InodeManager::recover( + google::protobuf::io::CodedInputStream * stream, HybridCache::RegionManager & region_manager, std::atomic & num_segments) +{ + Protos::NexusFSInodeManager manager; + google::protobuf::util::ParseDelimitedFromCodedStream(&manager, stream, nullptr); + + if (manager.prefix() != prefix || manager.surfix() != surfix) + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "Invalid prefix or surfix. 
Expected prefix: {}, surfix: {}, actual prefix: {}, surfix: {}", + prefix, + surfix, + manager.prefix(), + manager.surfix()); + + auto recover_files_in_inode = [&](std::shared_ptr & node, const Protos::NexusFSInode & proto_node) + { + for (const auto & proto_file : proto_node.files()) + { + auto file = std::make_shared(proto_file.file_size(), segment_size); + num_file_metas++; + for (const auto & proto_seg : proto_file.segments()) + { + auto rid = HybridCache::RegionId(proto_seg.address_rid()); + auto addr = RelAddress(rid, proto_seg.address_offset()); + auto handle = std::make_shared(addr, proto_seg.size()); + file->setHandle(proto_seg.segment_id(), handle); + region_manager.getRegion(rid).addHandle(handle); + num_segments++; + } + node->setHandle(proto_file.file_name(), file); + } + }; + + recover_files_in_inode(root_inode, manager.root_inode()); + for (const auto & proto_node : manager.inodes()) + { + auto inode = std::make_shared(proto_node.node_id()); + num_inodes++; + recover_files_in_inode(inode, proto_node); + inodes.emplace(proto_node.node_key(), inode); + } +} + + +String InodeManager::extractValidPath(const String & path) const +{ + if (path.size() <= prefix.size() + surfix.size()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} invalid, its length is smaller than prefix + surfix", path); + if (prefix != path.substr(0, prefix.size())) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} has invalid prefix, required prefix should be {}", path, prefix); + if (surfix != path.substr(path.size() - surfix.size(), surfix.size())) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} has invalid surfix, required surfix should be {}", path, surfix); + + String valid_path = path.substr(prefix.size(), path.size() - prefix.size() - surfix.size()); + if (valid_path.empty()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} is invalid, it consists of only prefix and suffix. 
", path); + + return valid_path; +} + + +void InodeManager::resolvePath(const String & path, std::vector & ressolved_dirs) const +{ + String valid_path = extractValidPath(path); + + String dir; + for (auto ch : valid_path) + { + if (ch == '/') + { + if (!dir.empty()) + { + ressolved_dirs.push_back(dir); + dir.clear(); + } + } + else + { + dir.push_back(ch); + } + } + if (!dir.empty()) + ressolved_dirs.push_back(dir); + + if (ressolved_dirs.empty()) + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "path {} is invalid, it consists of only prefix and suffix. ", path); + + { + String str = ressolved_dirs[0]; + for (size_t i = 1; i < ressolved_dirs.size(); i++) + { + str += "," + ressolved_dirs[i]; + } + LOG_TRACE(log, "resolvePath get: {}", str); + } +} + +void InodeManager::cleanInvalidFiles() +{ + for (auto [_, node] : inodes) + node->cleanInvalidFiles(num_file_metas); +} + +std::vector InodeManager::getFileCachedStates() +{ + std::vector ret; + root_inode->getFileCachedStates(ret); + for (auto [dir_name, node] : inodes) + node->getFileCachedStates(ret); + return ret; +} +} diff --git a/src/Storages/NexusFS/NexusFSInodeManager.h b/src/Storages/NexusFS/NexusFSInodeManager.h new file mode 100644 index 0000000000..23032a9cad --- /dev/null +++ b/src/Storages/NexusFS/NexusFSInodeManager.h @@ -0,0 +1,151 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include "IO/BufferWithOwnMemory.h" +#include "Storages/DiskCache/Buffer.h" +#include "Storages/DiskCache/Types.h" +#include "Storages/NexusFS/NexusFSBuffer.h" + +#include + + +namespace DB::NexusFSComponents +{ + +struct FileCachedState +{ + String file_path; + UInt64 total_segments; + UInt64 cached_segments; + UInt64 total_size; + UInt64 cached_size; +}; + +class FileMeta +{ +public: + explicit FileMeta(size_t file_size_, UInt32 segment_size) : file_size(file_size_) + { + chassert(segment_size > 0); + size_t segments_count = (file_size + 
segment_size - 1) / segment_size; + segments.resize(segments_count); + } + FileMeta(const FileMeta &) = delete; + FileMeta & operator=(const FileMeta &) = delete; + + size_t getFileSize() const { return file_size; } + std::pair getTotalSizeAndSegments() const { return {file_size, segments.size()}; } + std::pair getCachedSizeAndSegments(); + + std::shared_ptr getHandle(UInt64 segment_id); + void setHandle(UInt64 segment_id, std::shared_ptr & handle); + + void toProto(Protos::NexusFSFileMeta * proto); + + bool canBeRemoved(); + +private: + folly::fibers::TimedMutex mutex; + const size_t file_size; + std::vector> segments; +}; + +class Inode +{ +public: + explicit Inode(UInt64 id_) : id(id_) { } + Inode(const Inode &) = delete; + Inode & operator=(const Inode &) = delete; + + UInt64 getId() const { return id; } + + std::shared_ptr getHandle(String & file, UInt64 segment_id); + void setHandle( + const String & file, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size, + std::atomic & num_file_metas); + void setHandle(const String & file, std::shared_ptr & file_meta); + + void toProto(Protos::NexusFSInode * node); + + void cleanInvalidFiles(std::atomic & num_file_metas); + + void getFileCachedStates(std::vector & result); + +private: + UInt64 id; + folly::ConcurrentHashMap> files; +}; + +class InodeManager +{ +public: + explicit InodeManager(const String & prefix_, const String & surfix_, const UInt32 segment_size_) + : prefix(prefix_), surfix(surfix_), segment_size(segment_size_), root_inode(std::make_shared(0)) + { + num_inodes++; + } + InodeManager(const InodeManager &) = delete; + InodeManager & operator=(const InodeManager &) = delete; + + + void persist(google::protobuf::io::CodedOutputStream * stream) const; + void recover( + google::protobuf::io::CodedInputStream * stream, HybridCache::RegionManager & region_manager, std::atomic & num_segments); + + // Gets value and update tracking counters + std::shared_ptr 
lookup(const String & path, UInt64 segment_id) const; + + // Gets value without updating tracking counters + std::shared_ptr peek(const String & path, UInt64 segment_id) const; + + // Overwrites existing key if exists with new address adn size. If the entry was successfully overwritten, + // LookupResult returns . + void insert( + const String & path, + UInt64 segment_id, + std::shared_ptr & handle, + const std::function()> & get_file_and_segment_size); + + // Resets all the buckets to the initial state. + void reset(); + + void cleanInvalidFiles(); + + UInt64 getNumInodes() const { return num_inodes.load(); } + UInt64 getNumFileMetas() const { return num_file_metas.load(); } + + std::vector getFileCachedStates(); + +private: + String extractValidPath(const String & path) const; + void resolvePath(const String & path, std::vector & ressolved_dirs) const; + + LoggerPtr log = getLogger("NexusFSInodeManager"); + + const String prefix; + const String surfix; + const UInt32 segment_size; + std::atomic inode_id{1}; + std::shared_ptr root_inode; + folly::ConcurrentHashMap> inodes; + + std::atomic num_inodes{0}; + std::atomic num_file_metas{0}; +}; + +} diff --git a/src/Storages/NexusFS/tests/gtest_inode_manager_test.cpp b/src/Storages/NexusFS/tests/gtest_inode_manager_test.cpp new file mode 100644 index 0000000000..f871be3c30 --- /dev/null +++ b/src/Storages/NexusFS/tests/gtest_inode_manager_test.cpp @@ -0,0 +1,177 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace DB::NexusFSComponents +{ + +using namespace HybridCache; + +TEST(NexusFSInodeManager, GetAndSet) +{ + auto get_file_and_segment_size = []() { return std::make_pair(5, 1); }; + InodeManager index("/prefix/", "/data", 1); + auto h1 = std::make_shared(RelAddress(RegionId(1), 2), 3); + index.insert("/prefix//AA/BB/CC/data", 0, h1, get_file_and_segment_size); + auto h2 = index.lookup("/prefix/AA/BB//CC//data", 0); + EXPECT_EQ(h1.get(), h2.get()); + 
// Every path that lacks the configured prefix/suffix, or that contains nothing
// between them, must be rejected by lookup() with an Exception.
TEST(NexusFSInodeManager, InvalidPath)
{
    InodeManager index("/prefix/", "/data", 128);

    const char * const rejected_paths[] = {
        "",
        "/prefix1/AA/BB/CC/data",
        "/prefix/AA/BB/CC/data2",
        "/prefix/data",
        "/prefix//data",
        "/prefix///data",
    };

    for (const char * rejected : rejected_paths)
    {
        SCOPED_TRACE(rejected);
        EXPECT_THROW({ index.lookup(rejected, 0); }, Exception);
    }
}
// Runs many concurrent lookups of one inserted segment; the test passes when
// all reader threads finish without crashing.
TEST(NexusFSInodeManager, ThreadSafe)
{
    constexpr size_t num_readers = 200;
    const String file = "/prefix/AA/BB/CC/DD/EE/data";

    auto get_file_and_segment_size = []() { return std::make_pair(20, 1); };
    InodeManager index("/prefix/", "/data", 128);
    auto handle = std::make_shared<BlockHandle>(RelAddress(RegionId(1), 2), 3);
    index.insert(file, 10, handle, get_file_and_segment_size);

    std::vector<std::thread> readers;
    readers.reserve(num_readers);
    while (readers.size() < num_readers)
        readers.emplace_back([&]() { index.lookup(file, 10); });

    for (auto & reader : readers)
        reader.join();

    // TODO: hits
    // EXPECT_EQ(200, index.peek(key).getTotalHits());
    // EXPECT_EQ(200, index.peek(key).getCurrentHits());
}
google::protobuf::io::ArrayOutputStream raw_stream(metadata.data(), INT_MAX); + google::protobuf::io::CodedOutputStream ostream(&raw_stream); + + index.persist(&ostream); + } + + auto device = createMemoryDevice(4096 * 20, 4096); + auto policy = std::make_unique(); + RegionManager region_manager(20, 4096, 0, *device, 1, 1, {}, {}, std::move(policy), 2, 4, 10); + std::atomic num_segments = 0; + InodeManager new_index("/prefix/", "/data", 1); + google::protobuf::io::ArrayInputStream raw_stream(metadata.data(), INT_MAX); + google::protobuf::io::CodedInputStream istream(&raw_stream); + new_index.recover(&istream, region_manager, num_segments); + for (auto & entry : log) + { + EXPECT_TRUE(new_index.lookup(entry.first, entry.second)); + } + for (UInt64 i = 20; i < 24; i++) + { + for (UInt64 j = 0; j < 10; j++) + { + String file = "/prefix/123/" + toString(i) + "/data"; + EXPECT_FALSE(new_index.lookup(file, j)); + } + } + { + auto files = index.getFileCachedStates(); + EXPECT_EQ(20, files.size()); + for (auto & file : files) + { + EXPECT_EQ(10, file.cached_size); + EXPECT_EQ(10, file.total_size); + EXPECT_EQ(10, file.cached_segments); + EXPECT_EQ(10, file.total_segments); + } + } +} + +} diff --git a/src/Storages/System/StorageSystemNexusFS.cpp b/src/Storages/System/StorageSystemNexusFS.cpp new file mode 100644 index 0000000000..c3f535e9ec --- /dev/null +++ b/src/Storages/System/StorageSystemNexusFS.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +NamesAndTypesList StorageSystemNexusFS::getNamesAndTypes() +{ + return { + {"sub_file_path", std::make_shared()}, + {"total_size", std::make_shared()}, + {"cached_size", std::make_shared()}, + {"total_segments", std::make_shared()}, + {"cached_segments", std::make_shared()}, + }; +} + + +StorageSystemNexusFS::StorageSystemNexusFS(const StorageID & table_id_) + : IStorageSystemOneBlock(table_id_) +{ +} + +void StorageSystemNexusFS::fillData(MutableColumns & res_columns, ContextPtr 
context, const SelectQueryInfo &) const +{ + auto nexus_fs = context->getNexusFS(); + if (!nexus_fs) + return; + auto files = nexus_fs->getFileCachedStates(); + for (const auto & file : files) + { + res_columns[0]->insert(file.file_path); + res_columns[1]->insert(file.total_size); + res_columns[2]->insert(file.cached_size); + res_columns[3]->insert(file.total_segments); + res_columns[4]->insert(file.cached_segments); + } +} + +} diff --git a/src/Storages/System/StorageSystemNexusFS.h b/src/Storages/System/StorageSystemNexusFS.h new file mode 100644 index 0000000000..8bd491456c --- /dev/null +++ b/src/Storages/System/StorageSystemNexusFS.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class NexusFS; +class Context; + + +/** Implements system table asynchronous_metrics, which allows to get values of periodically (asynchronously) updated metrics. + */ +class StorageSystemNexusFS final : public shared_ptr_helper, + public IStorageSystemOneBlock +{ + friend struct shared_ptr_helper; +public: + std::string getName() const override { return "SystemNexusFS"; } + + static NamesAndTypesList getNamesAndTypes(); + +protected: +#if defined(ARCADIA_BUILD) + StorageSystemNexusFS(const String & name_,) + : StorageSystemNexusFS(StorageID{"system", name_}) + { + } +#endif + explicit StorageSystemNexusFS(const StorageID & table_id_); + + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index f24fda1408..b46ef12ffc 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -226,6 +227,7 @@ void attachSystemTablesLocal(IDatabase & system_database) #endif attach(system_database, "io_schedulers"); attach(system_database, "io_workers"); + 
attach(system_database, "nexus_fs"); } diff --git a/tests/queries/4_cnch_stateless/00474_readonly_settings.sh b/tests/queries/4_cnch_stateless/00474_readonly_settings.sh index ed09fd5ccf..225882b106 100755 --- a/tests/queries/4_cnch_stateless/00474_readonly_settings.sh +++ b/tests/queries/4_cnch_stateless/00474_readonly_settings.sh @@ -22,6 +22,7 @@ RAW_URL=`echo "${RAW_URL}" | sed "s@enable_optimizer_fallback=1&@@g"` RAW_URL=`echo "${RAW_URL}" | sed "s@enable_optimizer_fallback=0&@@g"` RAW_URL=`echo "${RAW_URL}" | sed "s@bsp_mode=1&@@g"` RAW_URL=`echo "${RAW_URL}" | sed "s@tenant_id=1234&@@g"` +RAW_URL=`echo "${RAW_URL}" | sed "s@enable_nexus_fs=1&@@g"` ${CLICKHOUSE_CURL} -sS "${RAW_URL}&session_id=readonly&session_timeout=3600" -d 'SET readonly = 1' ${CLICKHOUSE_CURL} -sS "${RAW_URL}&session_id=readonly&query=SELECT+toUInt64(pow(2,+63))+as+value+format+JSON&output_format_json_quote_64bit_integers=1" 2>&1 | grep -o "value\|Cannot modify 'output_format_json_quote_64bit_integers' setting in readonly mode" diff --git a/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql b/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql index ee315bf5a5..f57eccc8e6 100644 --- a/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql +++ b/tests/queries/4_cnch_stateless/11009_alter_disk_cache.sql @@ -1,7 +1,7 @@ USE test; set bsp_max_retry_num=0; -- disable bsp retry DROP TABLE IF EXISTS at_dc; -CREATE TABLE at_dc(a UInt32, p UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p; +CREATE TABLE at_dc(a UInt32, p UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p SETTINGS enable_nexus_fs = 0; INSERT INTO at_dc VALUES (1, 1), (2, 1), (3, 1); INSERT INTO at_dc VALUES (4, 2), (5, 2), (6, 2); @@ -23,7 +23,7 @@ SELECT a FROM at_dc WHERE p = 1 ORDER BY a SETTINGS disk_cache_mode = 'FORCE_DIS DROP TABLE at_dc; DROP TABLE IF EXISTS test_bucket_preload; -CREATE TABLE test_bucket_preload(a UInt32, p UInt32, c UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p 
CLUSTER BY c INTO 3 BUCKETS SETTINGS parts_preload_level = 1; +CREATE TABLE test_bucket_preload(a UInt32, p UInt32, c UInt32) ENGINE = CnchMergeTree ORDER BY a PARTITION BY p CLUSTER BY c INTO 3 BUCKETS SETTINGS parts_preload_level = 1, enable_nexus_fs = 0; INSERT INTO test_bucket_preload SELECT number, 1, number % 7 FROM numbers(10); SELECT '---bucket---'; @@ -39,7 +39,7 @@ DROP TABLE test_bucket_preload; DROP TABLE IF EXISTS 11009_alter_disk_cache; SELECT '---all segments stores in single compressed block---'; -CREATE TABLE 11009_alter_disk_cache (d Decimal(4, 3)) ENGINE = CnchMergeTree ORDER BY d SETTINGS index_granularity = 1, parts_preload_level = 1; +CREATE TABLE 11009_alter_disk_cache (d Decimal(4, 3)) ENGINE = CnchMergeTree ORDER BY d SETTINGS index_granularity = 1, parts_preload_level = 1, enable_nexus_fs = 0; INSERT INTO 11009_alter_disk_cache SELECT toDecimal64(number, 3) FROM numbers(10000); ALTER DISK CACHE PRELOAD TABLE test.11009_alter_disk_cache SYNC SETTINGS parts_preload_level = 3; SELECT d FROM 11009_alter_disk_cache WHERE toFloat64(d) = 7777.0 settings disk_cache_mode = 'FORCE_DISK_CACHE'; diff --git a/tests/queries/4_cnch_stateless/51001_index_col.sql b/tests/queries/4_cnch_stateless/51001_index_col.sql index 729dff7e0b..2172ea803f 100644 --- a/tests/queries/4_cnch_stateless/51001_index_col.sql +++ b/tests/queries/4_cnch_stateless/51001_index_col.sql @@ -13,7 +13,7 @@ create table test.multi_index_table ENGINE = CnchMergeTree PARTITION BY toStartOfInterval(ts, toIntervalHour(12)) ORDER BY ts -SETTINGS index_granularity = 8; +SETTINGS index_granularity = 8, enable_nexus_fs = 0; insert into table test.multi_index_table values ('2023-10-17 00:11:58.996', 'preload_test1', 'preload_test2', [1, 2, 3, 4, 5]) @@ -38,7 +38,7 @@ create table test.multi_index_table ENGINE = CnchMergeTree PARTITION BY toStartOfInterval(ts, toIntervalHour(12)) ORDER BY ts -SETTINGS index_granularity = 8; +SETTINGS index_granularity = 8, enable_nexus_fs = 0; insert into 
table test.multi_index_table values ('2022-10-17 00:11:58.996', 'preload_test1', 'preload_test2', [1, 2, 3, 4, 5])