Skip to content

Commit

Permalink
Merge pull request #962 from martindevans/nov_binaries
Browse files Browse the repository at this point in the history
November Binary Update
martindevans authored Nov 7, 2024
2 parents 079410c + 5b8906b commit b321839
Showing 14 changed files with 108 additions and 258 deletions.
4 changes: 4 additions & 0 deletions .github/_typos.toml
Original file line number Diff line number Diff line change
@@ -14,3 +14,7 @@ extend-exclude = [
"LLama.Benchmark/Assets/",
"LLama.Examples/Assets/"
]

[default.extend-words]
# Used in a comment in SafeLLamaSamplerHandle.cs, as a prefix of "hello"
teh = "hel"
1 change: 0 additions & 1 deletion LLama.Examples/Examples/CustomSampler.cs
Original file line number Diff line number Diff line change
@@ -60,7 +60,6 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl
chain.AddCustom(new RemoveMostLikelyToken());

// Select from the distribution
chain.AddSoftmax();
chain.AddDistributionSampler(42);

return chain;
1 change: 0 additions & 1 deletion LLama/Extensions/LLamaExecutorExtensions.cs
Original file line number Diff line number Diff line change
@@ -150,7 +150,6 @@ private string CreatePrompt(IList<ChatMessage> messages)
MinKeep = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinKeep), out int mk) is true ? mk : s_defaultPipeline.MinKeep,
MinP = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.MinP), out float mp) is true ? mp : s_defaultPipeline.MinP,
Seed = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.Seed), out uint seed) is true ? seed : (uint)(t_random ??= new()).Next(),
TailFreeZ = options?.AdditionalProperties?.TryGetValue(nameof(DefaultSamplingPipeline.TailFreeZ), out float tfz) is true ? tfz : s_defaultPipeline.TailFreeZ,
Temperature = options?.Temperature ?? 0,
TopP = options?.TopP ?? 0,
TopK = options?.TopK ?? s_defaultPipeline.TopK,
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
Original file line number Diff line number Diff line change
@@ -56,7 +56,7 @@
</ItemGroup>

<PropertyGroup>
<BinaryReleaseId>c35e586ea5722184</BinaryReleaseId>
<BinaryReleaseId>958367bf530d943a90</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
3 changes: 2 additions & 1 deletion LLama/LLavaWeights.cs
Original file line number Diff line number Diff line change
@@ -9,7 +9,8 @@ namespace LLama;
/// <summary>
/// A set of llava model weights (mmproj), loaded into memory.
/// </summary>
public sealed class LLavaWeights : IDisposable
public sealed class LLavaWeights
: IDisposable
{
/// <summary>
/// The native handle, which is used in the native APIs
10 changes: 3 additions & 7 deletions LLama/Native/LLamaNativeBatch.cs
Original file line number Diff line number Diff line change
@@ -25,6 +25,7 @@ public unsafe struct LLamaNativeBatch

/// <summary>
/// the positions of the respective token in the sequence
/// (if set to NULL, the token position will be tracked automatically by llama_decode)
/// </summary>
public LLamaPos* pos;

@@ -35,18 +36,13 @@ public unsafe struct LLamaNativeBatch

/// <summary>
/// the sequence to which the respective token belongs
/// (if set to NULL, the sequence ID will be assumed to be 0)
/// </summary>
public LLamaSeqId** seq_id;

/// <summary>
/// if zero, the logits for the respective token will not be output
/// (if set to NULL, only the logits for the last token will be returned)
/// </summary>
public byte* logits;

// Note from llama.cpp:
// > helpers for smooth API transition - can be deprecated in the future
// > for future-proof code, use the above fields instead and ignore everything below
private LLamaPos _all_pos_0;
private LLamaPos _all_pos_1;
private LLamaSeqId _all_seq_id;
}
5 changes: 5 additions & 0 deletions LLama/Native/LLamaPoolingType.cs
Original file line number Diff line number Diff line change
@@ -29,4 +29,9 @@ public enum LLamaPoolingType
CLS = 2,

Last = 3,

/// <summary>
/// Used by reranking models to attach the classification head to the graph
/// </summary>
Rank,
}
1 change: 1 addition & 0 deletions LLama/Native/LLamaVocabPreType.cs
Original file line number Diff line number Diff line change
@@ -33,4 +33,5 @@ internal enum LLamaVocabPreType
BLOOM = 23,
GPT3_FINNISH = 24,
EXAONE = 25,
CHAMELEON = 26,
}
186 changes: 0 additions & 186 deletions LLama/Native/NativeApi.Sampling.cs

This file was deleted.

8 changes: 8 additions & 0 deletions LLama/Native/NativeApi.cs
Original file line number Diff line number Diff line change
@@ -49,6 +49,14 @@ public static void llama_empty_call()
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llama_supports_gpu_offload();

/// <summary>
/// Check if RPC offload is supported
/// </summary>
/// <returns></returns>
[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.U1)]
public static extern bool llama_supports_rpc();

/// <summary>
/// Initialize the llama + ggml backend. Call once at the start of the program.
///
6 changes: 4 additions & 2 deletions LLama/Native/SafeLLamaContextHandle.cs
Original file line number Diff line number Diff line change
@@ -368,8 +368,10 @@ static SafeLLamaContextHandle()
private static extern LLamaPoolingType llama_pooling_type(SafeLLamaContextHandle ctx);

/// <summary>
/// Get the embeddings for the a specific sequence.
/// Equivalent to: llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
/// Get the embeddings for a sequence id.
/// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
/// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
/// otherwise: float[n_embd] (1-dimensional)
/// </summary>
/// <returns>A pointer to the first float in an embedding, length = ctx.EmbeddingSize</returns>
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
80 changes: 48 additions & 32 deletions LLama/Native/SafeLLamaSamplerHandle.cs
Original file line number Diff line number Diff line change
@@ -267,19 +267,6 @@ public void AddMirostat2Sampler(uint seed, float tau, float eta)
static extern IntPtr llama_sampler_init_mirostat_v2(uint seed, float tau, float eta);
}


/// <summary>
/// Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// </summary>
/// <returns></returns>
public void AddSoftmax()
{
llama_sampler_chain_add(this, llama_sampler_init_softmax());

[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
static extern IntPtr llama_sampler_init_softmax();
}

/// <summary>
/// Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
/// </summary>
@@ -309,7 +296,6 @@ public void AddTopP(float p, nint minKeep)
/// <summary>
/// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
/// </summary>
/// <returns></returns>
public void AddMinP(float p, nint minKeep)
{
llama_sampler_chain_add(this, llama_sampler_init_min_p(p, minKeep));
@@ -320,24 +306,9 @@ public void AddMinP(float p, nint minKeep)
// ReSharper restore InconsistentNaming
}

/// <summary>
/// Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
/// </summary>
/// <returns></returns>
public void AddTailFree(float z, nint minKeep)
{
llama_sampler_chain_add(this, llama_sampler_init_tail_free(z, minKeep));

// ReSharper disable InconsistentNaming
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
static extern IntPtr llama_sampler_init_tail_free(float p, nint min_keep);
// ReSharper restore InconsistentNaming
}

/// <summary>
/// Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
/// </summary>
/// <returns></returns>
public void AddTypical(float p, nint minKeep)
{
llama_sampler_chain_add(this, llama_sampler_init_typical(p, minKeep));
@@ -349,14 +320,15 @@ public void AddTypical(float p, nint minKeep)
}

/// <summary>
/// Apply temperature to the logits
/// Apply temperature to the logits.
/// If temperature is less than zero the maximum logit is left unchanged and the rest are set to -infinity
/// </summary>
/// <param name="t"></param>
/// <returns></returns>
public void AddTemperature(float t)
{
llama_sampler_chain_add(this, llama_sampler_init_temp(t));

// #details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
static extern IntPtr llama_sampler_init_temp(float t);
}
@@ -367,7 +339,6 @@ public void AddTemperature(float t)
/// <param name="t"></param>
/// <param name="delta"></param>
/// <param name="exponent"></param>
/// <returns></returns>
public void AddDynamicTemperature(float t, float delta, float exponent)
{
llama_sampler_chain_add(this, llama_sampler_init_temp_ext(t, delta, exponent));
@@ -376,6 +347,51 @@ public void AddDynamicTemperature(float t, float delta, float exponent)
static extern IntPtr llama_sampler_init_temp_ext(float t, float delta, float exponent);
}

/// <summary>
/// XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
/// </summary>
/// <param name="p"></param>
/// <param name="t"></param>
/// <param name="minKeep"></param>
/// <param name="seed"></param>
public void AddXTC(float p, float t, int minKeep, uint seed)
{
llama_sampler_chain_add(this, llama_sampler_init_xtc(p, t, minKeep, seed));

[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
static extern IntPtr llama_sampler_init_xtc(float p, float t, nint minKeep, uint seed);
}

/// <summary>
/// This sampler is meant to be used for fill-in-the-middle infilling, after top_k + top_p sampling
///<br />
/// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG<br />
/// 2. combine probs of tokens that have the same prefix<br />
/// <br />
/// example:<br />
/// <br />
/// - before:<br />
/// "abc": 0.5<br />
/// "abcd": 0.2<br />
/// "abcde": 0.1<br />
/// "dummy": 0.1<br />
///<br />
/// - after:<br />
/// "abc": 0.8<br />
/// "dummy": 0.1<br />
///<br />
/// 3. discard non-EOG tokens with low prob<br />
/// 4. if no tokens are left -> pick EOT
/// </summary>
/// <param name="model"></param>
public void AddFillInMiddleInfill(SafeLlamaModelHandle model)
{
llama_sampler_chain_add(this, llama_sampler_init_infill(model));

[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
static extern IntPtr llama_sampler_init_infill(SafeLlamaModelHandle model);
}

/// <summary>
/// Create a sampler which makes tokens impossible unless they match the grammar
/// </summary>
52 changes: 32 additions & 20 deletions LLama/Native/SafeLlamaModelHandle.cs
Original file line number Diff line number Diff line change
@@ -386,32 +386,29 @@ private static int llama_model_meta_val_str(SafeLlamaModelHandle model, string k
private static extern LLamaToken llama_token_pad(SafeLlamaModelHandle model);

/// <summary>
/// codellama infill tokens, Beginning of infill prefix
/// codellama infill tokens, End of infill middle
/// </summary>
/// <returns></returns>
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_token_prefix(SafeLlamaModelHandle model);
private static extern int llama_token_eot(SafeLlamaModelHandle model);

/// <summary>
/// codellama infill tokens, Beginning of infill middle
/// </summary>
/// <returns></returns>
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_token_middle(SafeLlamaModelHandle model);
private static extern int llama_token_fim_pre(SafeLlamaModelHandle model);

/// <summary>
/// codellama infill tokens, Beginning of infill suffix
/// </summary>
/// <returns></returns>
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_token_suffix(SafeLlamaModelHandle model);
private static extern int llama_token_fim_suf(SafeLlamaModelHandle model);

/// <summary>
/// codellama infill tokens, End of infill middle
/// </summary>
/// <returns></returns>
[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_token_eot(SafeLlamaModelHandle model);
private static extern int llama_token_fim_mid(SafeLlamaModelHandle model);

[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_token_fim_pad(SafeLlamaModelHandle model);

[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_token_fim_rep(SafeLlamaModelHandle model);

[DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
private static extern int llama_token_fim_sep(SafeLlamaModelHandle model);

/// <summary>
/// For encoder-decoder models, this function returns id of the token that must be provided
@@ -771,17 +768,32 @@ internal ModelTokens(SafeLlamaModelHandle model)
/// <summary>
/// Codellama beginning of infill prefix
/// </summary>
public LLamaToken? InfillPrefix => Normalize(llama_token_prefix(_model));
public LLamaToken? InfillPrefix => Normalize(llama_token_fim_pre(_model));

/// <summary>
/// Codellama beginning of infill middle
/// </summary>
public LLamaToken? InfillMiddle => Normalize(llama_token_middle(_model));
public LLamaToken? InfillMiddle => Normalize(llama_token_fim_mid(_model));

/// <summary>
/// Codellama beginning of infill suffix
/// </summary>
public LLamaToken? InfillSuffix => Normalize(llama_token_suffix(_model));
public LLamaToken? InfillSuffix => Normalize(llama_token_fim_suf(_model));

/// <summary>
/// Codellama pad
/// </summary>
public LLamaToken? InfillPad => Normalize(llama_token_fim_pad(_model));

/// <summary>
/// Codellama rep
/// </summary>
public LLamaToken? InfillRep => Normalize(llama_token_fim_rep(_model));

/// <summary>
/// Codellama sep
/// </summary>
public LLamaToken? InfillSep => Normalize(llama_token_fim_sep(_model));

/// <summary>
/// Codellama end of infill middle
7 changes: 0 additions & 7 deletions LLama/Sampling/DefaultSamplingPipeline.cs
Original file line number Diff line number Diff line change
@@ -83,11 +83,6 @@ public float AlphaPresence
/// </summary>
public int TopK { get; init; } = 40;

/// <summary>
/// Z value for tail free sampling
/// </summary>
public float TailFreeZ { get; init; } = 1;

/// <summary>
/// P value for locally typical sampling
/// </summary>
@@ -135,13 +130,11 @@ protected override SafeLLamaSamplerChainHandle CreateChain(SafeLLamaContextHandl
);

chain.AddTopK(TopK);
chain.AddTailFree(TailFreeZ, MinKeep);
chain.AddTypical(TypicalP, MinKeep);
chain.AddTopP(TopP, MinKeep);
chain.AddMinP(MinP, MinKeep);
chain.AddTemperature(Temperature);

chain.AddSoftmax();
chain.AddDistributionSampler(Seed);

return chain;

0 comments on commit b321839

Please sign in to comment.