Skip to content

Commit

Permalink
Merge pull request #64 from skit-ai/s3-download-file
Browse files Browse the repository at this point in the history
add: AWS s3 helper to download files from s3
  • Loading branch information
sreeram-narayanan authored Jan 15, 2024
2 parents 44800a5 + 911c7cf commit 4e82379
Show file tree
Hide file tree
Showing 3 changed files with 222 additions and 5 deletions.
201 changes: 201 additions & 0 deletions aws/s3.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
package aws

import (
"context"
"io"
"net/url"
"os"
"path/filepath"
"regexp"
"strings"

"github.com/skit-ai/vcore/errors"
"github.com/skit-ai/vcore/log/slog"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/s3"
"github.com/aws/aws-sdk-go/service/s3/s3manager"
)

const (
// Regex for S3 URLs, VPCE interface endpoint
// Source - https://github.com/aws/amazon-ssm-agent/blob/mainline/agent/s3util/s3uri.go
//
// Capture groups, counted by opening parenthesis:
//   1: optional "<bucket>." prefix, 2: the bucket name itself,
//   3: endpoint kind (bucket|accesspoint|control),
//   4: optional "<region>." segment, 5: the region itself,
//   6: the DNS suffix.
vpceURLPattern = "^((.+)\\.)?" + // maybe a bucket name
"(bucket|accesspoint|control)\\.vpce-[-a-z0-9]+\\." + // VPC endpoint DNS name
"s3[.-]" + // S3 service name
"(([-a-z0-9]+)\\.)?" + // region name, optional for us-east-1
"vpce\\." +
"(amazonaws\\.com|c2s\\.ic\\.gov|sc2s\\.sgov\\.gov)"
// Index 0 is the whole match, which includes the "<bucket>." prefix when
// one is present in the host.
vpceURLPatternHostIdx = 0
// Submatch index of the bucket name (empty when the host carries none).
vpceURLPatternBucketIdx = 2
// Submatch index of the region name (empty when the host carries none).
vpceURLPatternRegionIdx = 5

// Regex for S3 URLs, public S3 endpoint
//
// Capture groups, counted by opening parenthesis:
//   1: optional "<bucket>." prefix, 2: the bucket name itself,
//   3: optional "website", 4: optional "accelerate", 5: optional "dualstack",
//   6: optional "<region>." segment, 7: the region itself,
//   8: the DNS suffix.
nonVpceURLPattern = "^((.+)\\.)?" + // maybe a bucket name
"s3[.-](website[-.])?(accelerate\\.)?(dualstack[-.])?" + // S3 service name with optional features
"(([-a-z0-9]+)\\.)?" + // region name, optional for us-east-1
"(amazonaws\\.com|c2s\\.ic\\.gov|sc2s\\.sgov\\.gov)"
// Submatch index of the bucket name (empty when the host carries none).
nonVpceURLPatternBucketIdx = 2
// Submatch index of the region name (empty when the host carries none).
nonVpceURLPatternRegionIdx = 7
)

// Both patterns are compiled once at package initialisation; MustCompile
// panics if a pattern is invalid.
var (
// vpceUrlRegex matches S3 VPC interface endpoint hosts (vpceURLPattern).
vpceUrlRegex = regexp.MustCompile(vpceURLPattern)
// nonVpceUrlRegex matches public S3 endpoint hosts (nonVpceURLPattern).
nonVpceUrlRegex = regexp.MustCompile(nonVpceURLPattern)
)

// S3URL holds interesting pieces after parsing a s3 URL
type S3URL struct {
// IsPathStyle is set by ParseAmazonS3URL to true when the host carried no
// bucket name, i.e. the bucket was looked up in the URL path instead.
IsPathStyle bool
// EndPoint is the full matched host for VPC endpoint URLs and is left empty
// for public S3 hosts. DownloadFile passes it to the AWS config as Endpoint.
EndPoint string
// Bucket is the bucket name taken from the host or from the first path
// segment; it may be empty when the URL names no bucket.
Bucket string
// Key is the object key; it is empty when the URL addresses only a bucket.
Key string
// Region is the region parsed from the host. ParseAmazonS3URL substitutes
// "us-east-1" when the host has no region or uses "external-1".
Region string
}

// DownloadFile fetches the object identified by u.Bucket and u.Key and writes
// its contents into w.
//
// A new AWS session is created on every call, configured with u.Region and
// u.EndPoint. The number of bytes written is logged at debug level. Errors
// from session creation or from the download are wrapped and returned.
func (u S3URL) DownloadFile(ctx context.Context, w io.WriterAt) error {
	cfg := &aws.Config{
		Region:   aws.String(u.Region), // region the bucket lives in
		Endpoint: aws.String(u.EndPoint),
	}

	sess, err := session.NewSession(cfg)
	if err != nil {
		return errors.NewError("Error creating session", err, false)
	}

	input := &s3.GetObjectInput{
		Bucket: aws.String(u.Bucket),
		Key:    aws.String(u.Key),
	}

	written, err := s3manager.NewDownloader(sess).DownloadWithContext(ctx, w, input)
	if err != nil {
		return errors.NewError("Error downloading file", err, false)
	}

	slog.Debug("Downloaded file", "size", written)

	return nil
}

// ParseAmazonS3URL parses an HTTP/HTTPS URL for an S3 resource and returns an
// S3URL object.
//
// S3 URLs come in two flavors: virtual hosted-style URLs and path-style URLs.
// Virtual hosted-style URLs have the bucket name as the first component of the
// hostname, e.g.
//
// https://mybucket.s3.us-east-1.amazonaws.com/a/b/c
//
// Path-style URLs have the bucket name as the first component of the path, e.g.
//
// https://s3.us-east-1.amazonaws.com/mybucket/a/b/c
//
// The bucket and key are taken from s3URL.Path, which net/url has already
// percent-decoded. A host without a region, or with the legacy "external-1"
// region, is reported as "us-east-1". A path-style URL with an empty path
// yields an S3URL whose Bucket and Key are both empty.
//
// An error is returned when s3URL is nil or its host does not look like an S3
// endpoint.
func ParseAmazonS3URL(s3URL *url.URL) (S3URL, error) {
	if s3URL == nil {
		// Guard explicitly so callers get an error instead of a nil
		// pointer dereference on s3URL.Host.
		return S3URL{}, errors.NewError("nil s3 URL", nil, false)
	}

	output, err := parseBucketAndRegionFromHost(s3URL.Host)
	if err != nil {
		return S3URL{}, errors.NewError("parsing host failed", err, false)
	}

	// No bucket in the host means the bucket is the first path segment.
	output.IsPathStyle = output.Bucket == ""

	// Drop the single leading slash, if any; TrimPrefix leaves a path without
	// a leading slash intact rather than blindly cutting its first byte.
	path := strings.TrimPrefix(s3URL.Path, "/")

	if output.IsPathStyle {
		// "bucket", "bucket/" and "bucket/key" all split on the first
		// slash; the key is empty when nothing follows the bucket.
		output.Bucket, output.Key, _ = strings.Cut(path, "/")
	} else {
		// Bucket name is in the host, so the whole path is the object key.
		output.Key = path
	}

	// us-east-1 URLs may omit the region, and "external-1" is an alias of it.
	if output.Region == "" || strings.EqualFold(output.Region, "external-1") {
		output.Region = "us-east-1"
	}

	return output, nil
}

// parseBucketAndRegionFromHost extracts the bucket and region from an S3 host
// name.
//
// The host is first matched against the VPC interface endpoint pattern; on a
// match the whole matched host is also returned as EndPoint. Otherwise it is
// matched against the public S3 endpoint pattern, in which case EndPoint stays
// empty. Bucket and Region are empty strings when the host does not carry
// them. An error is returned when neither pattern matches.
func parseBucketAndRegionFromHost(host string) (S3URL, error) {
	if m := vpceUrlRegex.FindStringSubmatch(host); m != nil &&
		len(m) > vpceURLPatternBucketIdx && len(m) > vpceURLPatternRegionIdx {
		return S3URL{
			EndPoint: m[vpceURLPatternHostIdx],
			Bucket:   m[vpceURLPatternBucketIdx],
			Region:   m[vpceURLPatternRegionIdx],
		}, nil
	}

	// Bounds are checked against the non-VPCE indices, which are the ones
	// actually used to index the submatches below.
	if m := nonVpceUrlRegex.FindStringSubmatch(host); m != nil &&
		len(m) > nonVpceURLPatternBucketIdx && len(m) > nonVpceURLPatternRegionIdx {
		return S3URL{
			Bucket: m[nonVpceURLPatternBucketIdx],
			Region: m[nonVpceURLPatternRegionIdx],
		}, nil
	}

	return S3URL{}, errors.NewError("failed to match URL", nil, false)
}

// DownloadFileFromS3 takes an S3 URL and a filePath, downloads the file from s3 and stores it in the filePath.
//
// downloadURL must be an HTTP/HTTPS S3 URL understood by ParseAmazonS3URL.
// Missing parent directories of filePath are created, and filePath itself is
// created or truncated before the download starts.
//
// An error is returned when the URL cannot be parsed, the directory or file
// cannot be created, the download fails, or the file cannot be closed. When
// the download or the close fails, the incomplete file is removed on a
// best-effort basis so callers do not mistake it for a finished download.
func DownloadFileFromS3(ctx context.Context, downloadURL, filePath string) (err error) {
	parsedURL, err := url.Parse(downloadURL)
	if err != nil {
		return errors.NewError("Failed to parse URL", err, false)
	}

	// Parse s3 URL to extract region, key and bucket.
	s3URL, err := ParseAmazonS3URL(parsedURL)
	if err != nil {
		return errors.NewError("Failed to parse URL as s3 URL", err, false)
	}

	// Create file path
	err = os.MkdirAll(filepath.Dir(filePath), os.ModePerm)
	if err != nil {
		return errors.NewError("Unable to create directory", err, false)
	}

	// Create a local file to write to
	f, err := os.Create(filePath)
	if err != nil {
		return errors.NewError("Error creating file", err, false)
	}

	defer func() {
		// Close can report a failed flush of written data, so its error must
		// not be dropped; an earlier download error takes precedence.
		closeErr := f.Close()
		if err == nil && closeErr != nil {
			err = errors.NewError("Error closing file", closeErr, false)
		}

		if err != nil {
			// Best effort: the original failure is what the caller needs to
			// see, so a failed cleanup is deliberately ignored.
			_ = os.Remove(filePath)
		}
	}()

	return s3URL.DownloadFile(ctx, f)
}
10 changes: 5 additions & 5 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.19

require (
github.com/Vernacular-ai/gorm v1.11.3
github.com/aws/aws-sdk-go v1.44.153
github.com/aws/aws-sdk-go v1.49.15
github.com/getsentry/sentry-go v0.15.0
github.com/go-kit/log v0.2.1
github.com/google/go-cmp v0.5.9
Expand Down Expand Up @@ -94,11 +94,11 @@ require (
go.opentelemetry.io/proto/otlp v0.19.0 // indirect
go.uber.org/atomic v1.10.0 // indirect
go.uber.org/multierr v1.8.0 // indirect
golang.org/x/crypto v0.3.0 // indirect
golang.org/x/net v0.3.0 // indirect
golang.org/x/crypto v0.17.0 // indirect
golang.org/x/net v0.17.0 // indirect
golang.org/x/oauth2 v0.2.0 // indirect
golang.org/x/sys v0.3.0 // indirect
golang.org/x/text v0.5.0 // indirect
golang.org/x/sys v0.15.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
golang.org/x/xerrors v0.0.0-20220907171357-04be3eba64a2 // indirect
google.golang.org/api v0.103.0 // indirect
Expand Down
16 changes: 16 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,8 @@ github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgI
github.com/aws/aws-sdk-go v1.44.122/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo=
github.com/aws/aws-sdk-go v1.44.153 h1:KfN5URb9O/Fk48xHrAinrPV2DzPcLa0cd9yo1ax5KGg=
github.com/aws/aws-sdk-go v1.44.153/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
github.com/aws/aws-sdk-go v1.49.15 h1:aH9bSV4kL4ziH0AMtuYbukGIVebXddXBL0cKZ1zj15k=
github.com/aws/aws-sdk-go v1.49.15/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q=
github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8=
Expand Down Expand Up @@ -687,6 +689,10 @@ golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5y
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/crypto v0.3.0 h1:a06MkbcxBrEFc0w0QIZWXrH/9cCX6KJyWbBOIwAn+7A=
golang.org/x/crypto v0.3.0/go.mod h1:hebNnKkNXi2UzZN1eVRvBB7co0a+JxK6XbPiWVs/3J4=
golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc=
golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4=
golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k=
golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4=
golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
Expand Down Expand Up @@ -777,6 +783,8 @@ golang.org/x/net v0.0.0-20221014081412-f15817d10f9b/go.mod h1:YDH+HFinaLZZlnHAfS
golang.org/x/net v0.1.0/go.mod h1:Cx3nUiGt4eDBEyega/BKRp+/AlGL8hYe7U9odMt2Cco=
golang.org/x/net v0.3.0 h1:VWL6FNY2bEEmsGVKabSlHu5Irp34xmMRoqb/9lF9lxk=
golang.org/x/net v0.3.0/go.mod h1:MBQ8lrhLObU/6UmLb4fmbmk5OcyYmqtbGd/9yIeKjEE=
golang.org/x/net v0.17.0 h1:pVaXccu2ozPjCXewfr1S7xza/zcXTity9cCdXQYSjIM=
golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE=
golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
Expand Down Expand Up @@ -894,6 +902,10 @@ golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBc
golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.3.0 h1:w8ZOecv6NaNa/zC8944JTU3vz4u6Lagfk4RPQxv92NQ=
golang.org/x/sys v0.3.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE=
golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc=
golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.1.0/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
Expand All @@ -909,6 +921,10 @@ golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.4.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.5.0 h1:OLmvp0KP+FVG99Ct/qFiL/Fhk4zp4QQnZ7b2U+5piUM=
golang.org/x/text v0.5.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k=
golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ=
Expand Down

0 comments on commit 4e82379

Please sign in to comment.