Skip to content

Commit

Permalink
Rearrange a few things and add new filters
Browse files Browse the repository at this point in the history
  • Loading branch information
felipeelia committed Jan 27, 2025
1 parent 69049ae commit cc83899
Show file tree
Hide file tree
Showing 3 changed files with 212 additions and 109 deletions.
59 changes: 58 additions & 1 deletion includes/classes/Feature/VectorEmbeddings/Indexable.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@

namespace ElasticPressLabs\Feature\VectorEmbeddings;

use ElasticPress\Elasticsearch;

/**
* Vector Embeddings Indexable abstract class
*/
Expand All @@ -31,14 +33,69 @@ public function __construct( VectorEmbeddings $feature ) {
$this->feature = $feature;
}

/**
 * Add the nested `chunks` vector field to the Elasticsearch mapping.
 *
 * The field is a `nested` object with a single `dense_vector` sub-field called `vector`, sized with
 * the feature's configured dimensions. Depending on the Elasticsearch version, extra attributes
 * are set on the vector: `index` and `similarity` (8.0+), `element_type` (8.6+) and, when
 * quantization is requested, the `int8_hnsw` index type (8.12+).
 *
 * @param array $mapping      Current mapping.
 * @param bool  $quantization Whether to use quantization for the vector field. Default true.
 * @return array The mapping; returned untouched if it already has a `chunks` property.
 */
public function add_vector_mapping_field( array $mapping, bool $quantization = true ): array {
	// Don't add the field if it already exists. This check comes first so the Elasticsearch
	// version is only looked up when the mapping is actually going to be changed.
	if ( isset( $mapping['mappings']['properties']['chunks'] ) ) {
		return $mapping;
	}

	// NOTE(review): the cast assumes the version may come back as a non-string (e.g. when it
	// could not be determined) - confirm. An empty string compares lower than any real
	// version, so only the default mapping is added in that case.
	$es_version = (string) Elasticsearch::factory()->get_elasticsearch_version();

	// Default vector field definition, valid for every supported Elasticsearch version.
	$vector = [
		'type' => 'dense_vector',
		'dims' => $this->feature->get_dimensions(),
	];

	// Add extra vector attributes for newer versions of Elasticsearch.
	if ( version_compare( $es_version, '8.0', '>=' ) ) {
		// The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields
		// were added in 8.0. The similarity field must be set if index is true.
		$vector['index']      = true;
		$vector['similarity'] = 'cosine';

		// The element_type field was added in 8.6. This can be either float (default) or byte.
		if ( version_compare( $es_version, '8.6', '>=' ) ) {
			$vector['element_type'] = 'float';
		}

		// The int8_hnsw type was added in 8.12.
		if ( $quantization && version_compare( $es_version, '8.12', '>=' ) ) {
			// This is supposed to result in better performance but slightly less accurate results.
			// See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example.
			// Can test with this on and off and compare results to see what works best.
			$vector['index_options']['type'] = 'int8_hnsw';
		}
	}

	$mapping['mappings']['properties']['chunks'] = [
		'type'       => 'nested',
		'properties' => [
			'vector' => $vector,
		],
	];

	return $mapping;
}

/**
* Add the embedding data to the post vector sync args.
*
* @param array $args The current sync args (an Elasticsearch document)
* @param array $embeddings The embeddings to add to the sync args
* @return array
*/
public function add_chuncks_field_value( array $args, array $embeddings ): array {
public function add_chunks_field_value( array $args, array $embeddings ): array {
// If we still don't have embeddings, return early.
if ( empty( $embeddings ) ) {
return $args;
Expand Down
158 changes: 126 additions & 32 deletions includes/classes/Feature/VectorEmbeddings/Indexables/Post.php
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public function setup() {
* @return array
*/
public function add_post_vector_field_mapping( array $mapping ): array {
return $this->feature->add_vector_mapping_field( $mapping );
return $this->add_vector_mapping_field( $mapping );
}

/**
Expand Down Expand Up @@ -66,7 +66,11 @@ public function add_vector_field_to_post_sync( array $args, int $post_id ): arra
$post_chunks = $this->get_post_chunks( $post_id );
$embeddings = $this->feature->get_embedding( $post_id, 'post', $post_chunks );

return $this->add_chuncks_field_value( $args, $embeddings );
if ( ! is_array( $embeddings ) ) {
return $args;
}

return $this->add_chunks_field_value( $args, $embeddings );
}

/**
Expand Down Expand Up @@ -109,51 +113,79 @@ public function get_post_chunks( int $post_id ): array {
$main_content .= "# Content\n{$content}\n\n";
}

$chunks = $this->feature->chunk_content( $main_content );
/**
* Filter the main content of a post before being split into chunks.
*
* @hook ep_openai_embeddings_post_main_content
* @since 2.4.0
*
* @param {string} $main_content Title, excerpt, and content of a post.
* @param {\WP_Post} $post The post being processed.
* @return {string} The final main content representation.
*/
$main_content = apply_filters( 'ep_openai_embeddings_post_main_content', $main_content, $post );

$search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' );
$weighting = $search_feature->weighting->get_weighting_configuration_with_defaults();
if ( empty( $weighting[ $post->post_type ] ) ) {
return $chunks;
}

$post_type_weighting = $weighting[ $post->post_type ];
$chunks = $this->feature->chunk_content( $main_content );

$taxonomies = array_reduce(
array_keys( $post_type_weighting ),
function ( $acc, $field ) use ( $post_type_weighting ) {
if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/terms\.(.*)\.name/', $field, $matches ) ) {
$acc[] = $matches[1];
}
return $acc;
},
[]
);
$taxonomies = $this->get_embeddable_taxonomies( $post_id, $post->post_type );
if ( $taxonomies ) {
$post_terms_str = $this->get_post_terms( $post, $taxonomies );
if ( $post_terms_str ) {
$chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_terms_str ) ];

Check failure on line 134 in includes/classes/Feature/VectorEmbeddings/Indexables/Post.php

View workflow job for this annotation

GitHub Actions / PHP Lint

Array unpacking within array declarations using the spread operator is not supported in PHP 7.3 or earlier. Found: ...$chunks

Check failure on line 134 in includes/classes/Feature/VectorEmbeddings/Indexables/Post.php

View workflow job for this annotation

GitHub Actions / PHP Lint

Array unpacking within array declarations using the spread operator is not supported in PHP 7.3 or earlier. Found: ...$this
}
}

$meta_fields = array_reduce(
$meta_fields = $this->get_embeddable_meta( $post_id, $post->post_type );
if ( $meta_fields ) {
$post_meta_str = $this->get_post_meta( $post, $meta_fields );
if ( $post_meta_str ) {
$chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ];

Check failure on line 142 in includes/classes/Feature/VectorEmbeddings/Indexables/Post.php

View workflow job for this annotation

GitHub Actions / PHP Lint

Array unpacking within array declarations using the spread operator is not supported in PHP 7.3 or earlier. Found: ...$chunks

Check failure on line 142 in includes/classes/Feature/VectorEmbeddings/Indexables/Post.php

View workflow job for this annotation

GitHub Actions / PHP Lint

Array unpacking within array declarations using the spread operator is not supported in PHP 7.3 or earlier. Found: ...$this
}
}

return $chunks;
}

/**
* Return the list of taxonomies whose terms should be included in the post representation.
*
* The list is derived from the search feature's weighting configuration for the given post type
* and is returned through the `ep_openai_embeddings_post_embeddable_taxonomies` filter. When the
* post type has no weighting configuration, an empty list is handed to the filter instead.
*
* @param integer $post_id The post ID.
* @param string $post_type The post type.
* @return array
*/
protected function get_embeddable_taxonomies( int $post_id, string $post_type ): array {
$search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' );
$weighting = $search_feature->weighting->get_weighting_configuration_with_defaults();
if ( empty( $weighting[ $post_type ] ) ) {
/**
* Filter the list of taxonomies whose terms should be included in the post representation.
*
* @hook ep_openai_embeddings_post_embeddable_taxonomies
* @since 2.4.0
*
* @param {array} $embeddable_taxonomies Array of taxonomy names.
* @param {int} $post_id The post ID.
* @param {string} $post_type The post type.
* @return {array} The list of taxonomy names.
*/
return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', [], $post_id, $post_type );
}

$post_type_weighting = $weighting[ $post_type ];

// Walk every weighting field key, keeping only enabled fields whose key matches the pattern
// and collecting the name captured from the key.
$taxonomies = array_reduce(
array_keys( $post_type_weighting ),
function ( $acc, $field ) use ( $post_type_weighting ) {
if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/meta\.(.*)\.value/', $field, $matches ) ) {
if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/terms\.(.*)\.name/', $field, $matches ) ) {
$acc[] = $matches[1];
}
return $acc;
},
[]
);
if ( $meta_fields ) {
$post_meta_str = $this->get_post_meta( $post, $meta_fields );
if ( $post_meta_str ) {
$chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ];
}
}

return $chunks;
// This filter is documented above.
return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', $taxonomies, $post_id, $post_type );
}

/**
Expand Down Expand Up @@ -185,11 +217,63 @@ function ( $term ) {
}
}

return $post_terms_str;
/**
* Filter the string that represents the list of terms associated with this post.
*
* @hook ep_openai_embeddings_post_terms_str
* @since 2.4.0
*
* @param {string} $post_terms_str String with post terms.
* @param {WP_Post} $post The post.
* @return {string} The string with post terms.
*/
return apply_filters( 'ep_openai_embeddings_post_terms_str', $post_terms_str, $post );
}

/**
 * Return the list of meta fields that should be included in the post representation.
 *
 * Meta keys come from the search weighting configuration of the post type: every enabled field
 * whose name matches `meta.<key>.value` contributes `<key>`. The resulting list, which is empty
 * when no weighting configuration is available for the post type, is always passed through the
 * `ep_openai_embeddings_post_embeddable_meta` filter.
 *
 * @param integer $post_id   The post ID.
 * @param string  $post_type The post type.
 * @return array
 */
protected function get_embeddable_meta( int $post_id, string $post_type ): array {
	$search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' );

	// Guard against the search feature (or its weighting handler) not being available, so a
	// missing dependency yields an empty list instead of a fatal error.
	$weighting = ( is_object( $search_feature ) && ! empty( $search_feature->weighting ) )
		? $search_feature->weighting->get_weighting_configuration_with_defaults()
		: [];

	$post_type_weighting = ( ! empty( $weighting[ $post_type ] ) && is_array( $weighting[ $post_type ] ) )
		? $weighting[ $post_type ]
		: [];

	$meta_fields = [];
	foreach ( $post_type_weighting as $field => $field_config ) {
		// Fields without an `enabled` flag are treated as disabled.
		if ( empty( $field_config['enabled'] ) ) {
			continue;
		}

		// Weighting keys for meta fields look like `meta.<key>.value`; keep only the key.
		if ( preg_match( '/meta\.(.*)\.value/', (string) $field, $matches ) ) {
			$meta_fields[] = $matches[1];
		}
	}

	/**
	 * Filter the list of meta fields whose values should be included in the post representation.
	 *
	 * @hook ep_openai_embeddings_post_embeddable_meta
	 * @since 2.4.0
	 *
	 * @param {array}  $embeddable_meta Array of meta keys.
	 * @param {int}    $post_id         The post ID.
	 * @param {string} $post_type       The post type.
	 * @return {array} The list of meta keys.
	 */
	return apply_filters( 'ep_openai_embeddings_post_embeddable_meta', $meta_fields, $post_id, $post_type );
}

/**
* Get te representation of the post meta.
* Get the representation of the post meta.
*
* @param \WP_Post $post The post object
* @param array $meta_fields List of metafields
Expand All @@ -210,6 +294,16 @@ protected function get_post_meta( $post, $meta_fields ): string {
}
}

return $meta_str;
/**
* Filter the string that represents the meta fields associated with this post.
*
* @hook ep_openai_embeddings_post_meta_str
* @since 2.4.0
*
* @param {string} $meta_str String with post meta.
* @param {WP_Post} $post The post.
* @return {string} The string with post meta.
*/
return apply_filters( 'ep_openai_embeddings_post_meta_str', $meta_str, $post );
}
}
Loading

0 comments on commit cc83899

Please sign in to comment.