obot-platform · iwilltry42 · Feb 10, 2025 · g-linville · Feb 11, 2025 · iwilltry42
diff --git a/result-formatter/main.go b/result-formatter/main.go
@@ -5,7 +5,6 @@ import (
 	"encoding/json"
 	"fmt"
 	"os"
-	"strconv"
 	"strings"
 	"sync"
 
@@ -22,14 +21,26 @@ type subqueryResults struct {
 }
 
 type document struct {
-	ID       string         `json:"id"`
-	Content  string         `json:"content,omitempty"`
-	Metadata map[string]any `json:"metadata,omitempty"`
+	ID       string   `json:"id"`                 // ID is the document identifier (not read by the code shown in this change)
+	Content  string   `json:"content,omitempty"`  // Content is the text that is copied into hit.Content for this result
+	Metadata metadata `json:"metadata,omitempty"` // Metadata is the typed per-document metadata; NOTE(review): omitempty never omits a struct value in encoding/json
+}
+
+type metadata struct {
+	Source            string `json:"source,omitempty"`            // Source is used as the hit URL when it starts with "ws://"
+	WorkspaceID       string `json:"workspaceID,omitempty"`       // WorkspaceID must be non-empty for the full file to be fetched from the workspace
+	URL               string `json:"url,omitempty"`               // URL is used as the hit URL when Source does not start with "ws://"
+	Pages             string `json:"pages,omitempty"`             // Pages is rendered as "Pages <Pages>" in the hit location; takes precedence over Page
+	Page              int    `json:"page,omitempty"`              // Page is rendered as "Page <n>" when Pages is empty and Page > 0
+	TotalPages        int    `json:"totalPages,omitempty"`        // TotalPages, when > 0, is appended to a non-empty location as "of <n>"
+	FileSize          int    `json:"fileSize,omitempty"`          // FileSize (logged as bytes) gates the full fetch: 5_000 < FileSize < budget; NOTE(review): decodes only from a JSON number - confirm producers do not emit a string
+	WorkspaceFileName string `json:"workspaceFileName,omitempty"` // WorkspaceFileName is the location of the converted file, not the original file - e.g. <path>/foo.pdf.json; it is the key of the fullyFetched map
 }
 
 type hit struct {
-	URL     string `json:"url,omitempty"`
-	Content string `json:"content,omitempty"`
+	URL      string `json:"url,omitempty"`      // URL is the original source of the document (Web URL, OneDrive Link, Workspace File, etc.)
+	Location string `json:"location,omitempty"` // Location is where the result sits in the original source, e.g. "Page 3 of 10"; further locations in a fully fetched file are appended with " and "
+	Content  string `json:"content,omitempty"`  // Content is the text content of the document; replaced by the full file text when the file is fetched from the workspace
 }
 
 type inputContent struct {
@@ -44,7 +55,7 @@ func main() {
 		ctx               = context.Background()
 	)
 
-	// This is ugly code, I know. Beauty comes later.
+	// This is ugly code, I know. Beauty comes later. Cleaned up a little. Still room for improvement.
 
 	if clientErr != nil {
 		_, _ = fmt.Fprintf(os.Stderr, "failed to create gptscript client: %v\n", clientErr)
@@ -59,7 +70,7 @@ func main() {
 	var (
 		outDocs      []hit
 		wg           sync.WaitGroup
-		fullyFetched = map[string]struct{}{}
+		fullyFetched = map[string]int{} // fullyFetched is a map of files that have been fully fetched from the workspace - the value is the index in outDocs
 		budget       = 120_000
 	)
 
@@ -68,27 +79,53 @@ func main() {
 			break
 		}
 		for _, doc := range result.ResultDocuments {
-			filename, _ := doc.Metadata["workspaceFileName"].(string)
-			if _, ok := fullyFetched[filename]; ok {
+			filename := doc.Metadata.WorkspaceFileName
+
+			// We parse the location regardless of the file potentially being fully fetched already to preserve the
+			// source reference metadata (i.e. where in the document the information was found).
+			// This is a UX thing to help users with manual proofreading of answers.
+			var location string
+			if doc.Metadata.Pages != "" {
+				location = "Pages " + doc.Metadata.Pages
+			} else if doc.Metadata.Page > 0 {
+				location = fmt.Sprintf("Page %d", doc.Metadata.Page)
+			}
+			if location != "" && doc.Metadata.TotalPages > 0 {
+				location = fmt.Sprintf("%s of %d", location, doc.Metadata.TotalPages)
+				_, _ = fmt.Fprintf(os.Stderr, "result doc in file %q at %q\n", filename, location)
+			}
+
+			if ffi, ok := fullyFetched[filename]; ok {
+				if location != "" {
+					outDocs[ffi].Location += " and " + location
+				}
 				continue
 			}
 
-			url, _ := doc.Metadata["url"].(string)
+			// url should be the original source of the document (Web URL, OneDrive Link, Workspace File, etc.)
+			var url string
+			if strings.HasPrefix(doc.Metadata.Source, "ws://") {
+				url = doc.Metadata.Source
+			} else {
+				url = doc.Metadata.URL
+			}
+			_, _ = fmt.Fprintf(os.Stderr, "result doc url %q\n", url)
+
 			outDocs = append(outDocs, hit{
-				URL:     url,
-				Content: doc.Content,
+				URL:      url,
+				Content:  doc.Content,
+				Location: location,
 			})
 
 			index := len(outDocs) - 1
 
 			if index < 3 && clientErr == nil {
-				fileSize, _ := doc.Metadata["fileSize"].(string)
-				size, _ := strconv.Atoi(fileSize)
-				workspaceID, _ := doc.Metadata["workspaceID"].(string)
-				if size > 5_000 && size < budget && workspaceID != "" {
-					_, _ = fmt.Fprintf(os.Stderr, "reading file in workspace: %s\n", filename)
-					fullyFetched[filename] = struct{}{}
-					budget -= size
+				fileSize := doc.Metadata.FileSize
+				workspaceID := doc.Metadata.WorkspaceID
+				if fileSize > 5_000 && fileSize < budget && workspaceID != "" {
+					_, _ = fmt.Fprintf(os.Stderr, "fetching full file %q from workspace: %d bytes\n", filename, fileSize)
+					fullyFetched[filename] = index
+					budget -= fileSize
 					wg.Add(1)
 
 					go func() {
@@ -115,10 +152,11 @@ func main() {
 
 						if buffer.Len() > 0 {
 							outDocs[index].Content = buffer.String()
+							outDocs[index].Location = "Full Document. Specifically " + outDocs[index].Location
 						}
 					}()
 				} else {
-					_, _ = fmt.Fprintf(os.Stderr, "file size is not within the range: %s %s %d %d\n", workspaceID, filename, size, budget)
+					_, _ = fmt.Fprintf(os.Stderr, "file %q size %d is not within range %d\n", fmt.Sprintf("%s/%s", workspaceID, filename), fileSize, budget)
 				}
 			}
 		}