Skip to content

Commit

Permalink
Rewrite certain parts
Browse files Browse the repository at this point in the history
  • Loading branch information
vshampor committed Oct 26, 2024
1 parent 7a7dfda commit 9d4f3bb
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 74 deletions.
46 changes: 27 additions & 19 deletions src/cpp/src/cache_eviction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,11 +119,11 @@ class CacheEvictionAlgorithm {

class CacheRotationCalculator {
public:
CacheRotationCalculator(size_t block_size, size_t max_context_length, size_t kv_head_size, double rope_theta = 10000.0f) : m_block_size(block_size) {
CacheRotationCalculator(size_t block_size, size_t max_context_length, size_t kv_head_size, double rope_theta = 10000.0f) : m_block_size(block_size), m_head_size(kv_head_size) {
size_t max_position_angle_multiplier = max_context_length / 2 + 1; // adding +1 here and below for good measure in case of odd dividends
size_t num_freqs = kv_head_size / 2 + 1;
m_rope_sin_lut.reserve(max_position_angle_multiplier);
m_rope_cos_lut.reserve(max_position_angle_multiplier);
m_rope_sin_lut.resize(max_position_angle_multiplier);
m_rope_cos_lut.resize(max_position_angle_multiplier);

for (size_t i = 0; i < max_position_angle_multiplier; i++) {
m_rope_sin_lut[i].reserve(num_freqs);
Expand All @@ -138,45 +138,53 @@ class CacheRotationCalculator {
};

using RotationCoefficientsPerToken = std::vector<std::vector<double>>;
std::pair<RotationCoefficientsPerToken, RotationCoefficientsPerToken> get_rotation_multipliers(const std::set<size_t>& evicted_block_logical_indices, size_t num_logical_blocks_before_eviction) {
std::pair<RotationCoefficientsPerToken, RotationCoefficientsPerToken> retval;
struct BlockRotationData {
size_t logical_block_idx;
RotationCoefficientsPerToken sines;
RotationCoefficientsPerToken cosines;
};
std::vector<BlockRotationData> get_rotation_multipliers(const std::set<size_t>& evicted_block_logical_indices, size_t num_logical_blocks_before_eviction) {
std::vector<BlockRotationData> retval;
if (evicted_block_logical_indices.empty()) {
return retval;
}

retval.reserve(num_logical_blocks_before_eviction - evicted_block_logical_indices.size());

ptrdiff_t current_rotation_delta_in_positions = 0;
std::vector<size_t> logical_block_space(num_logical_blocks_before_eviction);
std::iota(logical_block_space.begin(), logical_block_space.end(), 0);

std::vector<ptrdiff_t> rotation_deltas;
rotation_deltas.reserve(num_logical_blocks_before_eviction - evicted_block_logical_indices.size());

for (size_t logical_block_idx : logical_block_space) {
if (evicted_block_logical_indices.find(logical_block_idx) != evicted_block_logical_indices.end()) {
current_rotation_delta_in_positions += 1;
}
else {
if (current_rotation_delta_in_positions != 0) {
rotation_deltas.push_back(current_rotation_delta_in_positions);
BlockRotationData block_rotation_data;
block_rotation_data.logical_block_idx = logical_block_idx;
block_rotation_data.cosines.reserve(m_block_size / 2);
block_rotation_data.sines.reserve(m_block_size / 2);
for (size_t i = 0; i < m_block_size / 2; i++) {
block_rotation_data.cosines.push_back(m_rope_cos_lut[current_rotation_delta_in_positions]);
block_rotation_data.sines.push_back(m_rope_sin_lut[current_rotation_delta_in_positions]);
}

retval.push_back(block_rotation_data);
}
}
}

size_t num_tokens_to_rotate = rotation_deltas.size() * m_block_size;
retval.first.reserve(num_tokens_to_rotate);
retval.second.reserve(num_tokens_to_rotate);
for (ptrdiff_t delta : rotation_deltas) {
for (size_t i = 0; i < m_block_size; i++) {
retval.first.push_back(m_rope_cos_lut[delta]);
retval.second.push_back(m_rope_sin_lut[delta]);
}
}

return retval;
}

size_t get_head_size() const {
return m_head_size;
}

private:
size_t m_block_size;
size_t m_head_size;
std::vector<std::vector<double>> m_rope_sin_lut;
std::vector<std::vector<double>> m_rope_cos_lut;
};
Expand Down
75 changes: 49 additions & 26 deletions src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,11 +76,14 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::init(
m_model_runner = std::make_shared<ModelRunner>(infer_request, updated_config, device_config.get_num_layers(),
/* m_collect_attention_scores = */ true);
m_rotation_coefficient_stores.reserve(device_config.get_num_layers());
ov::Shape rotation_coefficient_store_shape{ device_config.get_head_size(), scheduler_config.block_size * scheduler_config.num_kv_blocks };
ov::Shape rotation_coefficient_store_shape{ device_config.get_head_size() * (scheduler_config.block_size * scheduler_config.num_kv_blocks) };
for (size_t i = 0; i < device_config.get_num_layers(); i++) {
ov::Tensor store(device_config.get_cache_precision(), rotation_coefficient_store_shape);
ov::Tensor store(ov::element::f32, rotation_coefficient_store_shape);
std::memset(store.data(), 0, store.get_byte_size());
m_rotation_coefficient_stores.push_back(store);
}
m_next_step_rotation_coefficients.resize(device_config.get_num_layers());
m_next_step_rotated_block_logical_indices_per_sequence.resize(device_config.get_num_layers());
m_cache_rotation_calculator = std::make_shared<CacheRotationCalculator>(scheduler_config.block_size,
// TODO (vshampor): LUT size equal to max cache size in tokens
// is overkill - find a way to pass the max sequence length instead
Expand Down Expand Up @@ -205,7 +208,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
// evict unimportant blocks from KV cache, if requested
if (sched_config.use_cache_eviction) {
maybe_evict_cache_blocks(sched_config);
m_model_runner->set_cache_rotation_coefficients(m_next_step_rotation_coefficients);
m_model_runner->set_cache_rotation_data(m_next_step_rotation_coefficients, m_next_step_rotated_block_logical_indices_per_sequence);
}

#ifdef DEBUG_CACHE_STATE_DUMP
Expand Down Expand Up @@ -389,12 +392,20 @@ float ContinuousBatchingPipeline::ContinuousBatchingImpl::_get_current_running_a
void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_blocks(const SchedulerConfig& sched_config) {
std::unordered_map<SequenceGroup::Ptr, size_t> seq_group_to_num_blocks_evicted_map;
auto sequence_attention_scores = m_model_runner->get_last_attention_scores();

OPENVINO_ASSERT(!sequence_attention_scores.empty());
size_t num_decoder_layers = sequence_attention_scores.begin()->second.size();
std::vector<size_t> num_blocks_to_rotate_for_each_layer(num_decoder_layers, 0);
size_t head_size = m_cache_rotation_calculator->get_head_size();

m_next_step_rotation_coefficients.clear();
m_next_step_rotated_block_logical_indices_per_sequence.clear();
m_next_step_rotated_block_logical_indices_per_sequence.resize(num_decoder_layers);

for (auto& seq_id_and_attention_scores : sequence_attention_scores) {
auto seq_id = seq_id_and_attention_scores.first;
const auto& attention_scores_for_all_decoder_layers = seq_id_and_attention_scores.second;
if (m_seq_group_id_to_cache_eviction_algo_map.find(seq_id) == m_seq_group_id_to_cache_eviction_algo_map.end()) {
auto num_decoder_layers = attention_scores_for_all_decoder_layers.size();

m_seq_group_id_to_cache_eviction_algo_map[seq_id] = CacheEvictionAlgorithm(sched_config.cache_eviction_config, sched_config.block_size, num_decoder_layers);
}
auto& cache_eviction_algo = m_seq_group_id_to_cache_eviction_algo_map[seq_id];
Expand All @@ -403,31 +414,37 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block
auto logical_blocks_to_evict = cache_eviction_algo.evict_logical_blocks();


for (size_t i = 0; i < logical_blocks_to_evict.size(); i++) {
size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[i].size();
for (size_t layer_idx = 0; layer_idx < logical_blocks_to_evict.size(); layer_idx++) {
if (logical_blocks_to_evict[layer_idx].empty()) {
continue;
}
size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[layer_idx].size();
auto rotation_multipliers =
m_cache_rotation_calculator->get_rotation_multipliers(logical_blocks_to_evict[i],
m_cache_rotation_calculator->get_rotation_multipliers(logical_blocks_to_evict[layer_idx],
num_blocks_before_eviction);
const auto& rotation_multipliers_cos = rotation_multipliers.first;
const auto& rotation_multipliers_sin = rotation_multipliers.second;
OPENVINO_ASSERT(rotation_multipliers_cos.size() == rotation_multipliers_sin.size());
const size_t num_kv_heads = m_rotation_coefficient_stores[i].get_shape()[0];
const size_t num_tokens = rotation_multipliers_cos.size() * 2;

ov::Tensor rotation_multipliers_tensor(m_rotation_coefficient_stores[i],
ov::Coordinate{0, 0},
ov::Coordinate{num_kv_heads, num_tokens});

// Fill the ROI tensor with rotation coefficient data - cos and sin coefficients are interleaved.
auto rotation_multipliers_tensor_data = rotation_multipliers_tensor.data<float>();
for (size_t head_idx = 0; head_idx < num_kv_heads; head_idx++) {
for (size_t pos_idx = 0; pos_idx < rotation_multipliers_cos.size(); pos_idx++) {
size_t head_offset = head_idx * num_tokens;
rotation_multipliers_tensor_data[head_offset + 2 * pos_idx] = rotation_multipliers_cos[head_idx][pos_idx];
rotation_multipliers_tensor_data[head_offset + 2 * pos_idx + 1] = rotation_multipliers_sin[head_idx][pos_idx];
for (size_t rotated_block_idx = 0; rotated_block_idx < rotation_multipliers.size(); rotated_block_idx++) {
const auto& block_rotation_data = rotation_multipliers[rotated_block_idx];
const auto& rotation_multipliers_cos = block_rotation_data.cosines;
const auto& rotation_multipliers_sin = block_rotation_data.sines;
OPENVINO_ASSERT(rotation_multipliers_cos.size() == rotation_multipliers_sin.size());
OPENVINO_ASSERT(rotation_multipliers_cos.size() * 2 == sched_config.block_size);

m_next_step_rotated_block_logical_indices_per_sequence[layer_idx][seq_id].push_back(block_rotation_data.logical_block_idx);

// Fill the store tensor with rotation coefficient data - cos and sin coefficients are interleaved
// NB: the order of seq_id in each per-sequence iteration of the `for (auto& seq_id_and_attention_scores ...` must be the same
// as the order of seq_ids in which the "rotated_block_indices.N" inputs are filled
size_t sequence_offset = num_blocks_to_rotate_for_each_layer[layer_idx] * sched_config.block_size * head_size;
auto rotation_multipliers_tensor_data = m_rotation_coefficient_stores[layer_idx].data<float>() + sequence_offset;
for (size_t tok_idx = 0; tok_idx < rotation_multipliers_cos.size(); tok_idx++) {
size_t position_offset = head_size * tok_idx;
for (size_t embedding_pair_idx = 0; embedding_pair_idx < head_size / 2; embedding_pair_idx++) {
rotation_multipliers_tensor_data[position_offset + 2 * embedding_pair_idx] = rotation_multipliers_cos[tok_idx][embedding_pair_idx];
rotation_multipliers_tensor_data[position_offset + 2 * embedding_pair_idx + 1] = rotation_multipliers_sin[tok_idx][embedding_pair_idx];
}
}
num_blocks_to_rotate_for_each_layer[layer_idx] += 1;
}
m_next_step_rotation_coefficients[i] = rotation_multipliers_tensor;
}

m_scheduler->free_blocks_from_sequence(seq_id, logical_blocks_to_evict);
Expand All @@ -444,6 +461,12 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block
}

}

// Select the previously filled rotation coefficients from the store tensor
for (size_t i = 0; i < num_decoder_layers; i++) {
m_next_step_rotation_coefficients.emplace_back(m_rotation_coefficient_stores[i], ov::Coordinate{0}, ov::Coordinate{num_blocks_to_rotate_for_each_layer[i] * head_size * sched_config.block_size});
}

for (const auto& seq_group_ptr_and_num_blocks_evicted : seq_group_to_num_blocks_evicted_map) {
// Assuming that the evicted blocks are always full (since they by design are only selected from intermediate-age blocks)
auto seq_group_ptr = seq_group_ptr_and_num_blocks_evicted.first;
Expand Down
7 changes: 5 additions & 2 deletions src/cpp/src/continuous_batching_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc

static const size_t AVG_CACHE_USAGE_WINDOW_SIZE_IN_STEPS = 1000;
std::deque<float> m_previous_step_cache_usages;

// flag to enable validation mode for sampler
bool m_is_validation_mode_enabled = false;

Expand All @@ -37,6 +37,9 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
// re-rotation coefficients to be sent to the proper model inputs at the *next* pipeline step.
std::vector<ov::Tensor> m_next_step_rotation_coefficients;

using SeqIdToRotatedLogicalBlocksMap = std::map<size_t, std::vector<size_t>>;
std::vector<SeqIdToRotatedLogicalBlocksMap> m_next_step_rotated_block_logical_indices_per_sequence;

std::shared_ptr<ov::genai::CacheRotationCalculator> m_cache_rotation_calculator;

#ifdef DEBUG_CACHE_STATE_DUMP
Expand Down Expand Up @@ -96,4 +99,4 @@ class ContinuousBatchingPipeline::ContinuousBatchingImpl : public ContinuousBatc
const std::vector<GenerationConfig>& sampling_params,
const StreamerVariant& streamer) override;
};
}
}
Loading

0 comments on commit 9d4f3bb

Please sign in to comment.