diff --git a/examples/anthropic-vision-example/README.md b/examples/anthropic-vision-example/README.md new file mode 100644 index 000000000..56a192917 --- /dev/null +++ b/examples/anthropic-vision-example/README.md @@ -0,0 +1,34 @@ +# Anthropic Vision Example + +Hello there! 👋 This example demonstrates how to use the Anthropic Claude 3.5 Sonnet model for image analysis using Go and the LangChain Go library. Let's break down what this exciting code does! + +## What This Example Does + +1. **Sets Up Anthropic**: The code initializes an Anthropic client to interact with the Claude 3.5 Sonnet model. + +2. **Loads an Image**: An image file (`image.png`) is embedded into the binary using Go's `embed` package. This image will be analyzed by the AI model. + +3. **Sends a Request**: The code constructs a request to the Claude 3.5 Sonnet model, including: + - The image data as a base64-encoded string (in PNG format) + - A text prompt asking to identify the string on a box in the image + +4. **Processes the Response**: After sending the request, the code handles the response from the AI model, extracting the generated content and some metadata about token usage. + +5. **Outputs Results**: Finally, it prints out the AI's interpretation of what string is on the box in the image. + +## Key Features + +- **Multimodal AI**: This example showcases how to work with both image and text inputs in a single AI request. +- **Error Handling**: Includes basic error checking to ensure the process runs smoothly. +- **Token Usage Tracking**: Logs the number of input and output tokens used, which can be helpful for monitoring usage and costs. + +## Running the Example + +To run this example, you'll need: + +1. An Anthropic API key set in your environment variables +2. The required Go dependencies installed + +Once everything is set up, simply run the Go file, and it should output the AI's interpretation of the text on the box in the image! 
+ +Happy coding, and enjoy exploring the fascinating world of multimodal AI with Claude 3! 🚀🖼️🤖 diff --git a/examples/anthropic-vision-example/anthropic_vision_example.go b/examples/anthropic-vision-example/anthropic_vision_example.go new file mode 100644 index 000000000..30634f338 --- /dev/null +++ b/examples/anthropic-vision-example/anthropic_vision_example.go @@ -0,0 +1,59 @@ +package main + +import ( + "context" + _ "embed" + "fmt" + "log" + + "github.com/tmc/langchaingo/llms" + "github.com/tmc/langchaingo/llms/anthropic" +) + +//go:embed image.png +var image []byte + +func main() { + llm, err := anthropic.New( + anthropic.WithModel("claude-3-5-sonnet-20240620"), + ) + if err != nil { + log.Fatal(err) + } + ctx := context.Background() + resp, err := llm.GenerateContent( + ctx, + []llms.MessageContent{ + { + Role: llms.ChatMessageTypeHuman, + Parts: []llms.ContentPart{ + // Supported image MIME types are image/png, image/jpeg, image/gif and image/webp. + // Set the MIME type below to match the bytes you actually pass. + // For more details, see https://docs.anthropic.com/claude/reference/messages_post + llms.BinaryPart("image/png", image), + llms.TextPart("Please tell me the string on the box."), + }, + }, + }, + llms.WithMaxTokens(1000), + llms.WithTemperature(0.1), + llms.WithTopP(1.0), + llms.WithTopK(100), + ) + if err != nil { + log.Fatal(err) + } + choices := resp.Choices + if len(choices) < 1 { + log.Fatal("empty response from model") + } + + log.Printf( + "input_tokens: %d, output_tokens: %d", + choices[0].GenerationInfo["InputTokens"], + choices[0].GenerationInfo["OutputTokens"], + ) + fmt.Println(choices[0].Content) + // Output: + // The string on the box in the image is "LGTM". 
+} diff --git a/examples/anthropic-vision-example/go.mod b/examples/anthropic-vision-example/go.mod new file mode 100644 index 000000000..43d84a051 --- /dev/null +++ b/examples/anthropic-vision-example/go.mod @@ -0,0 +1,13 @@ +module github.com/tmc/langchaingo/examples/anthropic-vision-example + +go 1.22.0 + +toolchain go1.22.1 + +require github.com/tmc/langchaingo v0.1.13-pre.1 + +require ( + github.com/dlclark/regexp2 v1.10.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/pkoukk/tiktoken-go v0.1.6 // indirect +) diff --git a/examples/anthropic-vision-example/go.sum b/examples/anthropic-vision-example/go.sum new file mode 100644 index 000000000..e92326ed5 --- /dev/null +++ b/examples/anthropic-vision-example/go.sum @@ -0,0 +1,22 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0= +github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/pkoukk/tiktoken-go v0.1.6 h1:JF0TlJzhTbrI30wCvFuiw6FzP2+/bR+FIxUdgEAcUsw= +github.com/pkoukk/tiktoken-go v0.1.6/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 
+github.com/tmc/langchaingo v0.1.13-pre.1 h1:r+ma9kl0NuFJGtIrnMPFjEn4RhXktwSI31fIpgiiMm4= +github.com/tmc/langchaingo v0.1.13-pre.1/go.mod h1:vpQ5NOIhpzxDfTZK9B6tf2GM/MoaHewPWM5KXXGh7hg= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo= +sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8= diff --git a/examples/anthropic-vision-example/image.png b/examples/anthropic-vision-example/image.png new file mode 100644 index 000000000..27da75c58 Binary files /dev/null and b/examples/anthropic-vision-example/image.png differ diff --git a/llms/anthropic/anthropicllm.go b/llms/anthropic/anthropicllm.go index cddb7ce05..5c5617407 100644 --- a/llms/anthropic/anthropicllm.go +++ b/llms/anthropic/anthropicllm.go @@ -2,6 +2,7 @@ package anthropic import ( "context" + "encoding/base64" "encoding/json" "errors" "fmt" @@ -266,13 +267,37 @@ func handleSystemMessage(msg llms.MessageContent) (string, error) { } func handleHumanMessage(msg llms.MessageContent) (anthropicclient.ChatMessage, error) { - if textContent, ok := msg.Parts[0].(llms.TextContent); ok { - return anthropicclient.ChatMessage{ - Role: RoleUser, - Content: textContent.Text, - }, nil + var contents []anthropicclient.Content + + for _, part := range msg.Parts { + switch p := part.(type) { + case llms.TextContent: + contents = append(contents, &anthropicclient.TextContent{ + Type: "text", + Text: p.Text, + }) + case llms.BinaryContent: + contents = append(contents, &anthropicclient.ImageContent{ + Type: "image", + Source: anthropicclient.ImageSource{ + Type: "base64", + MediaType: p.MIMEType, + Data: base64.StdEncoding.EncodeToString(p.Data), + }, + }) + default: + return 
anthropicclient.ChatMessage{}, fmt.Errorf("anthropic: %w for human message: unsupported part type %T", ErrInvalidContentType, part) + } + } + + if len(contents) == 0 { + return anthropicclient.ChatMessage{}, fmt.Errorf("anthropic: no valid content in human message") } - return anthropicclient.ChatMessage{}, fmt.Errorf("anthropic: %w for human message", ErrInvalidContentType) + + return anthropicclient.ChatMessage{ + Role: RoleUser, + Content: contents, + }, nil } func handleAIMessage(msg llms.MessageContent) (anthropicclient.ChatMessage, error) { diff --git a/llms/anthropic/internal/anthropicclient/messages.go b/llms/anthropic/internal/anthropicclient/messages.go index 3cdd22af5..d4f87ab49 100644 --- a/llms/anthropic/internal/anthropicclient/messages.go +++ b/llms/anthropic/internal/anthropicclient/messages.go @@ -66,6 +66,21 @@ func (tc TextContent) GetType() string { return tc.Type } +type ImageContent struct { + Type string `json:"type"` + Source ImageSource `json:"source"` +} + +func (ic ImageContent) GetType() string { + return ic.Type +} + +type ImageSource struct { + Type string `json:"type"` + MediaType string `json:"media_type"` + Data string `json:"data"` +} + type ToolUseContent struct { Type string `json:"type"` ID string `json:"id"`