From cc83899a516922c1ad407c628f03c22f9d6f2b3d Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 27 Jan 2025 16:30:13 -0300 Subject: [PATCH] Rearrange a few things and add new filters --- .../Feature/VectorEmbeddings/Indexable.php | 59 ++++++- .../VectorEmbeddings/Indexables/Post.php | 158 ++++++++++++++---- .../VectorEmbeddings/VectorEmbeddings.php | 104 ++++-------- 3 files changed, 212 insertions(+), 109 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index e96c184..304cb0a 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -11,6 +11,8 @@ namespace ElasticPressLabs\Feature\VectorEmbeddings; +use ElasticPress\Elasticsearch; + /** * Vector Embeddings Indexable abstract class */ @@ -31,6 +33,61 @@ public function __construct( VectorEmbeddings $feature ) { $this->feature = $feature; } + /** + * Add a vector field to the Elasticsearch mapping. + * + * @param array $mapping Current mapping. + * @param bool $quantization Whether to use quantization for the vector field. Default true. + * @return array + */ + public function add_vector_mapping_field( array $mapping, bool $quantization = true ): array { + $es_version = Elasticsearch::factory()->get_elasticsearch_version(); + + // Don't add the field if it already exists. + if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { + return $mapping; + } + + // Add the default vector field mapping. + $mapping['mappings']['properties']['chunks'] = [ + 'type' => 'nested', + 'properties' => [ + 'vector' => [ + 'type' => 'dense_vector', + 'dims' => $this->feature->get_dimensions(), + ], + ], + ]; + + // Add extra vector fields for newer versions of Elasticsearch. + if ( version_compare( $es_version, '8.0', '>=' ) ) { + // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields + // were added in 8.0. 
The similarity field must be set if index is true. + $mapping['mappings']['properties']['chunks']['properties']['vector'] = array_merge( + $mapping['mappings']['properties']['chunks']['properties']['vector'], + [ + 'index' => true, + 'similarity' => 'cosine', + ] + ); + + // The element_type field was added in 8.6. This can be either float (default) or byte. + if ( version_compare( $es_version, '8.6', '>=' ) ) { + $mapping['mappings']['properties']['chunks']['properties']['vector']['element_type'] = 'float'; + } + + // The int8_hnsw type was added in 8.12. + if ( $quantization && version_compare( $es_version, '8.12', '>=' ) ) { + // This is supposed to result in better performance but slightly less accurate results. + // See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example. + // Can test with this on and off and compare results to see what works best. + $mapping['mappings']['properties']['chunks']['properties']['vector']['index_options']['type'] = 'int8_hnsw'; + } + } + + return $mapping; + } + /** * Add the embedding data to the post vector sync args. * @@ -38,7 +95,7 @@ public function __construct( VectorEmbeddings $feature ) { * @param array $embeddings The embeddings to add to the sync args * @return array */ - public function add_chuncks_field_value( array $args, array $embeddings ): array { + public function add_chunks_field_value( array $args, array $embeddings ): array { // If we still don't have embeddings, return early. 
if ( empty( $embeddings ) ) { return $args; diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index f48ae3d..ecc8844 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -37,7 +37,7 @@ public function setup() { * @return array */ public function add_post_vector_field_mapping( array $mapping ): array { - return $this->feature->add_vector_mapping_field( $mapping ); + return $this->add_vector_mapping_field( $mapping ); } /** @@ -66,7 +66,11 @@ public function add_vector_field_to_post_sync( array $args, int $post_id ): arra $post_chunks = $this->get_post_chunks( $post_id ); $embeddings = $this->feature->get_embedding( $post_id, 'post', $post_chunks ); - return $this->add_chuncks_field_value( $args, $embeddings ); + if ( ! is_array( $embeddings ) ) { + return $args; + } + + return $this->add_chunks_field_value( $args, $embeddings ); } /** @@ -109,26 +113,21 @@ public function get_post_chunks( int $post_id ): array { $main_content .= "# Content\n{$content}\n\n"; } - $chunks = $this->feature->chunk_content( $main_content ); + /** + * Filter the main content of a post before being split into chunks. + * + * @hook ep_openai_embeddings_post_main_content + * @since 2.4.0 + * + * @param {string} $main_content Title, excerpt, and content of a post. + * @param {\WP_Post} $post The post being processed. + * @return {string} The final main content representation. 
+ */ + $main_content = apply_filters( 'ep_openai_embeddings_post_main_content', $main_content, $post ); - $search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' ); - $weighting = $search_feature->weighting->get_weighting_configuration_with_defaults(); - if ( empty( $weighting[ $post->post_type ] ) ) { - return $chunks; - } - - $post_type_weighting = $weighting[ $post->post_type ]; + $chunks = $this->feature->chunk_content( $main_content ); - $taxonomies = array_reduce( - array_keys( $post_type_weighting ), - function ( $acc, $field ) use ( $post_type_weighting ) { - if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/terms\.(.*)\.name/', $field, $matches ) ) { - $acc[] = $matches[1]; - } - return $acc; - }, - [] - ); + $taxonomies = $this->get_embeddable_taxonomies( $post_id, $post->post_type ); if ( $taxonomies ) { $post_terms_str = $this->get_post_terms( $post, $taxonomies ); if ( $post_terms_str ) { @@ -136,24 +135,57 @@ function ( $acc, $field ) use ( $post_type_weighting ) { } } - $meta_fields = array_reduce( + $meta_fields = $this->get_embeddable_meta( $post_id, $post->post_type ); + if ( $meta_fields ) { + $post_meta_str = $this->get_post_meta( $post, $meta_fields ); + if ( $post_meta_str ) { + $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ]; + } + } + + return $chunks; + } + + /** + * Return the list of taxonomies that should be included in the post representation. + * + * @param integer $post_id The post ID. + * @param string $post_type The post type. + * @return array + */ + protected function get_embeddable_taxonomies( int $post_id, string $post_type ): array { + $search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' ); + $weighting = $search_feature->weighting->get_weighting_configuration_with_defaults(); + if ( empty( $weighting[ $post_type ] ) ) { + /** + * Filter the list of taxonomies which terms should be included in the post representation. 
+ * + * @hook ep_openai_embeddings_post_embeddable_taxonomies + * @since 2.4.0 + * + * @param {array} $embeddable_taxonomies Array of taxonomy names. + * @param {int} $post_id The post ID. + * @param {string} $post_type The post type. + * @return {array} The list of taxonomy names. + */ + return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', [], $post_id, $post_type ); + } + + $post_type_weighting = $weighting[ $post_type ]; + + $taxonomies = array_reduce( array_keys( $post_type_weighting ), function ( $acc, $field ) use ( $post_type_weighting ) { - if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/meta\.(.*)\.value/', $field, $matches ) ) { + if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/terms\.(.*)\.name/', $field, $matches ) ) { $acc[] = $matches[1]; } return $acc; }, [] ); - if ( $meta_fields ) { - $post_meta_str = $this->get_post_meta( $post, $meta_fields ); - if ( $post_meta_str ) { - $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ]; - } - } - return $chunks; + // This filter is documented above. + return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', $taxonomies, $post_id, $post_type ); } /** @@ -185,11 +217,63 @@ function ( $term ) { } } - return $post_terms_str; + /** + * Filter the string that represents the list of terms associated with this post. + * + * @hook ep_openai_embeddings_post_terms_str + * @since 2.4.0 + * + * @param {string} $post_terms_str String with post terms. + * @param {WP_Post} $post The post. + * @return {string} The string with post terms. + */ + return apply_filters( 'ep_openai_embeddings_post_terms_str', $post_terms_str, $post ); + } + + /** + * Return the list of metafields that should be included in the post representation. + * + * @param integer $post_id The post ID. + * @param string $post_type The post type. 
+ * @return array + */ + protected function get_embeddable_meta( int $post_id, string $post_type ): array { + $search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' ); + $weighting = $search_feature->weighting->get_weighting_configuration_with_defaults(); + if ( empty( $weighting[ $post_type ] ) ) { + /** + * Filter the list of metafields which values should be included in the post representation. + * + * @hook ep_openai_embeddings_post_embeddable_meta + * @since 2.4.0 + * + * @param {array} $embeddable_meta Array of meta keys. + * @param {int} $post_id The post ID. + * @param {string} $post_type The post type. + * @return {array} The list of meta keys. + */ + return apply_filters( 'ep_openai_embeddings_post_embeddable_meta', [], $post_id, $post_type ); + } + + $post_type_weighting = $weighting[ $post_type ]; + + $meta_fields = array_reduce( + array_keys( $post_type_weighting ), + function ( $acc, $field ) use ( $post_type_weighting ) { + if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/meta\.(.*)\.value/', $field, $matches ) ) { + $acc[] = $matches[1]; + } + return $acc; + }, + [] + ); + + // This filter is documented above. + return apply_filters( 'ep_openai_embeddings_post_embeddable_meta', $meta_fields, $post_id, $post_type ); } /** - * Get te representation of the post meta. + * Get the representation of the post meta. * * @param \WP_Post $post The post object * @param array $meta_fields List of metafields @@ -210,6 +294,16 @@ protected function get_post_meta( $post, $meta_fields ): string { } } - return $meta_str; + /** + * Filter the string that represents the meta fields associated with this post. + * + * @hook ep_openai_embeddings_post_meta_str + * @since 2.4.0 + * + * @param {string} $meta_str String with post meta. + * @param {WP_Post} $post The post. + * @return {string} The string with post meta. 
+ */ + return apply_filters( 'ep_openai_embeddings_post_meta_str', $meta_str, $post ); } } diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 58051b6..647e90e 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -25,13 +25,6 @@ * Vector Embeddings feature */ class VectorEmbeddings extends Feature { - /** - * Elasticsearch version. - * - * @var string $es_version - */ - protected $es_version; - /** * Number of dimensions for the embeddings. * @@ -67,8 +60,6 @@ public function __construct() { 'elasticpress-labs' ); - $this->es_version = Elasticsearch::factory()->get_elasticsearch_version(); - parent::__construct(); } @@ -91,7 +82,7 @@ public function requirements_status() { $status = new \ElasticPress\FeatureRequirementsStatus( 1 ); // Vector support was added in Elasticsearch 7.0. - if ( version_compare( $this->es_version, '7.0', '<=' ) ) { + if ( version_compare( Elasticsearch::factory()->get_elasticsearch_version(), '7.0', '<=' ) ) { $status->code = 2; $status->message = esc_html__( 'You need to have Elasticsearch with version >7.0.', 'elasticpress-labs' ); } @@ -150,79 +141,20 @@ public function set_settings_schema() { } /** - * Add a vector field to the Elasticsearch mapping. - * - * @param array $mapping Current mapping. - * @param bool $quantization Whether to use quantization for the vector field. Default false. - * @return array - */ - public function add_vector_mapping_field( array $mapping, bool $quantization = true ): array { - // Don't add the field if it already exists. - if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { - return $mapping; - } - - // Add the default vector field mapping. 
- $mapping['mappings']['properties']['chunks'] = [ - 'type' => 'nested', - 'properties' => [ - 'vector' => [ - 'type' => 'dense_vector', - 'dims' => $this->get_dimensions(), - ], - ], - ]; - - // Add extra vector fields for newer versions of Elasticsearch. - if ( version_compare( $this->es_version, '8.0', '>=' ) ) { - // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields - // were added in 8.0. The similarity field must be set if index is true. - $mapping['mappings']['properties']['chunks']['properties']['vector'] = array_merge( - $mapping['mappings']['properties']['chunks']['properties']['vector'], - [ - 'index' => true, - 'similarity' => 'cosine', - ] - ); - - // The element_type field was added in 8.6. This can be either float (default) or byte. - if ( version_compare( $this->es_version, '8.6', '>=' ) ) { - $mapping['mappings']['properties']['chunks']['properties']['vector']['element_type'] = 'float'; - } - - // The int8_hnsw type was added in 8.12. - if ( $quantization && version_compare( $this->es_version, '8.12', '>=' ) ) { - // This is supposed to result in better performance but slightly less accurate results. - // See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example. - // Can test with this on and off and compare results to see what works best. - $mapping['mappings']['properties']['chunks']['properties']['vector']['index_options']['type'] = 'int8_hnsw'; - } - } - - return $mapping; - } - - /** - * Get an embedding from a given text. + * Get an embedding from a given string or array of strings. * * @param int $object_id The Object ID. * @param string $object_type The Object type. - * @param string|array $text Text or array of strings to get the embedding for. - * @param string $return_type Return type ('array' or 'raw'). Default 'array'. + * @param string|array $text String or array of strings to get the embedding for. 
* @return array|null|WP_Error */ - public function get_embedding( int $object_id, string $object_type, $text, string $return_type = 'array' ) { + public function get_embedding( int $object_id, string $object_type, $text ) { // Generate the embedding. if ( defined( 'WP_CLI' ) && WP_CLI ) { \WP_CLI::line( "Generating embedding for {$object_type} ID: {$object_id}" ); } - $embedding = $this->generate_embedding( $text ); - if ( is_wp_error( $embedding ) ) { - return 'raw' === $return_type ? $embedding : null; - } - - return $embedding; + return $this->generate_embedding( $text ); } /** @@ -295,10 +227,19 @@ public function generate_embedding( $text = '' ) { ) ); - error_log( 'generating embed' ); + /** + * Filter the response of the request. + * + * @hook ep_openai_embeddings_request_response + * @since 2.4.0 + * + * @param {array|WP_Error} $response The request response. + * @param {array|string} $text The text that was sent to be processed. + * @return {array|WP_Error} The request response. + */ + $response = apply_filters( 'ep_openai_embeddings_request_response', $response, $text ); if ( is_wp_error( $response ) ) { - error_log( print_r( $response, true ) ); return $response; } @@ -417,7 +358,7 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove // Iterate through & chunk data with an overlap. for ( $i = 0; $i < $text_count; $i += $chunk_size ) { // Join a set of words into a string. - $chunk = 'search_document: ' . implode( + $chunk = implode( ' ', array_slice( $words, @@ -426,6 +367,17 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove ) ); + /** + * Filter a chunk of text. + * + * @hook ep_openai_embeddings_chunk + * @since 2.4.0 + * + * @param {string} $chunk The chunk being processed. + * @return {string} The modified chunk. + */ + $chunk = apply_filters( 'ep_openai_embeddings_chunk', $chunk ); + array_push( $chunks, $chunk ); }