Merge pull request #10 from langchain-ai/brace/report-generator

feat: Implement report and post generators
langchain-ai · Nov 27, 2024 · 50ef560 · 50ef560
2 parents c4b06be + 82730d7
commit 50ef560
Show file tree

Hide file tree

Showing 31 changed files with 1,283 additions and 286 deletions.
diff --git a/package.json b/package.json
@@ -27,9 +27,11 @@
     "@langchain/core": "^0.3.18",
     "@langchain/google-vertexai-web": "^0.1.2",
     "@langchain/langgraph": "^0.2.22",
-    "@mendable/firecrawl-js": "^1.8.5",
+    "@mendable/firecrawl-js": "0.0.36",
     "@slack/web-api": "^7.7.0",
+    "cheerio": "^1.0.0",
     "moment": "^2.30.1",
+    "twitter-api-v2": "^1.18.2",
     "zod": "^3.23.8"
   },
   "devDependencies": {

diff --git a/src/agent/graph.ts b/src/agent/graph.ts
@@ -1,5 +1,5 @@
 import { END, Send, START, StateGraph } from "@langchain/langgraph";
-import { GraphAnnotation } from "./state.js";
+import { ConfigurableAnnotation, GraphAnnotation } from "./state.js";
 import { ingestData } from "./nodes/ingest-data.js";
 import { generatePostGraph } from "./subgraphs/generate-post/graph.js";
 
@@ -23,7 +23,7 @@ function routeAfterIdentifyContent(
   });
 }
 
-const builder = new StateGraph(GraphAnnotation)
+const builder = new StateGraph(GraphAnnotation, ConfigurableAnnotation)
   // Ingests posts from Slack channel.
   .addNode("ingestData", ingestData)
   // Subgraph which is invoked once for each message.

diff --git a/src/agent/nodes/ingest-data.ts b/src/agent/nodes/ingest-data.ts
@@ -6,13 +6,13 @@ import { extractUrlsFromSlackText } from "../utils.js";
 const getChannelIdFromConfig = async (
   config: LangGraphRunnableConfig,
 ): Promise<string | undefined> => {
-  if (config.configurable?.slack.channelName) {
+  if (config.configurable?.slackChannelName) {
     const client = new SlackMessageFetcher({
-      channelName: config.configurable.slack.channelName,
+      channelName: config.configurable.slackChannelName,
     });
     return await client.getChannelId();
   }
-  return config.configurable?.slack.channelId;
+  return config.configurable?.slackChannelId;
 };
 
 export async function ingestData(
@@ -27,8 +27,13 @@ export async function ingestData(
   const client = new SlackMessageFetcher({
     channelId: channelId,
   });
-
-  const recentMessages = await client.fetchLast24HoursMessages();
+  console.log("Before fetching messages");
+  const recentMessages = await client.fetchLast24HoursMessages(
+    config.configurable?.maxMessages,
+  );
+  if (recentMessages.length > 1) {
+    throw new Error("More than one message found");
+  }
   const messagesWithUrls = recentMessages.flatMap((msg) => {
     const links = extractUrlsFromSlackText(msg.text);
     if (!links.length) {
@@ -39,7 +44,7 @@ export async function ingestData(
       links,
     };
   });
-
+  console.log("returning", messagesWithUrls.length, " messages");
   return {
     slackMessages: messagesWithUrls,
   };

diff --git a/src/agent/state.ts b/src/agent/state.ts
@@ -1,4 +1,4 @@
-import { Annotation, MessagesAnnotation } from "@langchain/langgraph";
+import { Annotation } from "@langchain/langgraph";
 import { SimpleSlackMessage } from "../clients/slack.js";
 
 export type LangChainProduct = "langchain" | "langgraph" | "langsmith";
@@ -7,16 +7,10 @@ export type SimpleSlackMessageWithLinks = SimpleSlackMessage & {
 };
 
 export const GraphAnnotation = Annotation.Root({
-  ...MessagesAnnotation.spec,
   /**
    * The Slack messages to use for the content.
    */
   slackMessages: Annotation<SimpleSlackMessageWithLinks[]>,
-  /**
-   * The LangChain product(s) this content is relevant to.
-   * Undefined if it is not relevant to any product.
-   */
-  relevantProducts: Annotation<LangChainProduct[] | undefined>,
   /**
    * A report generated on the content. Will be used in the main
    * graph when generating the post about this content.
@@ -31,3 +25,12 @@ export const GraphAnnotation = Annotation.Root({
    */
   twitterPost: Annotation<string>,
 });
+
+export const ConfigurableAnnotation = Annotation.Root({
+  maxMessages: Annotation<number>({
+    reducer: (_state, update) => update,
+    default: () => 100,
+  }),
+  slackChannelName: Annotation<string | undefined>,
+  slackChannelId: Annotation<string | undefined>,
+});
diff --git a/src/agent/subgraphs/generate-post/state.ts b/src/agent/subgraphs/generate-post/generate-post-state.ts
@@ -47,11 +47,8 @@ export const GraphAnnotation = Annotation.Root({
     },
     default: () => [],
   }),
-});
-
-export const VerifyContentAnnotation = Annotation.Root({
   /**
-   * The link to the content to verify.
+   * The content of the Tweet/LinkedIn post.
    */
-  link: Annotation<string>,
+  post: Annotation<string>,
 });
diff --git a/src/agent/subgraphs/generate-post/graph.ts b/src/agent/subgraphs/generate-post/graph.ts
@@ -1,30 +1,17 @@
 import { END, Send, START, StateGraph } from "@langchain/langgraph";
-import { GraphAnnotation, VerifyContentAnnotation } from "./state.js";
+import { GraphAnnotation } from "./generate-post-state.js";
 import { generateContentReport } from "./nodes/generate-content-report.js";
-import { verifyGeneralContent } from "./nodes/verify-general.js";
-import { verifyYouTubeContent } from "./nodes/verify-youtube.js";
-import { verifyGitHubContent } from "./nodes/verify-github.js";
-import { generateLinkedinPost } from "./nodes/generate-post/linkedin.js";
-import { generateTwitterPost } from "./nodes/generate-post/twitter.js";
+import { verifyGeneralContent } from "../shared/nodes/verify-general.js";
+import { verifyYouTubeContent } from "../shared/nodes/verify-youtube.js";
+import { verifyGitHubContent } from "../shared/nodes/verify-github.js";
+import { generatePosts } from "./nodes/generate-post.js";
 import { schedulePost } from "./nodes/schedule-post.js";
+import { VerifyContentAnnotation } from "../shared/shared-state.js";
+import { verifyTweetGraph } from "../verify-tweet/graph.js";
 
-/**
- * Should do the following:
- * Handle youtube videos
- * Handle GitHub repos
- * Handle all other content (general purpose web scraping)
- *
- * YouTube videos:
- * 1. use gemini 1.5 flash to ingest youtube video & create a summary
- * 2. pass the summary to claude and have claude identify if it's langchain content
- *
- * GitHub repos:
- * 1a. Pull the readme from the repo, pass to claude and ask to identify if it's LangChain content.
- * 1b. iterate over the first 100 .js|jsx|ts|tsx or .py files, use regex to extract all imports, verify it has LangChain imports.
- *
- * All others:
- * Mayb FireCrawl to scrape the page content. Then pass to an LLM to identify if it's LangChain content.
- */
+const isTwitterUrl = (url: string) => {
+  return url.includes("twitter.com") || url.includes("x.com");
+};
 
 /**
  * This conditional edge will iterate over all the links in a slack message.
@@ -35,24 +22,32 @@ function routeContentTypes(state: typeof GraphAnnotation.State) {
     if (link.includes("youtube.com")) {
       return new Send("verifyYouTubeContent", {
         link,
+        slackMessage: state.slackMessage,
       });
     } else if (link.includes("github.com")) {
       return new Send("verifyGitHubContent", {
         link,
+        slackMessage: state.slackMessage,
+      });
+    } else if (isTwitterUrl(link)) {
+      return new Send("verifyTweetSubGraph", {
+        link,
+        slackMessage: state.slackMessage,
       });
     } else {
       return new Send("verifyGeneralContent", {
         link,
+        slackMessage: state.slackMessage,
       });
     }
   });
 }
 
 function routeAfterGeneratingReport(
   state: typeof GraphAnnotation.State,
-): "generateLinkedinPost" | typeof END {
+): "generatePosts" | typeof END {
   if (state.report) {
-    return "generateLinkedinPost";
+    return "generatePosts";
   }
   return END;
 }
@@ -68,11 +63,12 @@ const generatePostBuilder = new StateGraph(GraphAnnotation)
   .addNode("verifyGitHubContent", verifyGitHubContent, {
     input: VerifyContentAnnotation,
   })
+  .addNode("verifyTweetSubGraph", verifyTweetGraph, {
+    input: VerifyContentAnnotation,
+  })
 
-  // Generates a post on the content for LinkedIn.
-  .addNode("generateLinkedinPost", generateLinkedinPost)
-  // Generates a post on the content for Twitter.
-  .addNode("generateTwitterPost", generateTwitterPost)
+  // Generates a Tweet/LinkedIn post based on the report content.
+  .addNode("generatePosts", generatePosts)
   // Interrupts the node for human in the loop, then schedules the
   // post for Twitter/LinkedIn.
   .addNode("schedulePost", schedulePost)
@@ -83,24 +79,24 @@ const generatePostBuilder = new StateGraph(GraphAnnotation)
     "verifyYouTubeContent",
     "verifyGeneralContent",
     "verifyGitHubContent",
+    "verifyTweetSubGraph",
   ])
 
   // After verifying the different content types, we should generate a report on them.
   .addEdge("verifyYouTubeContent", "generateContentReport")
   .addEdge("verifyGeneralContent", "generateContentReport")
   .addEdge("verifyGitHubContent", "generateContentReport")
+  .addEdge("verifyTweetSubGraph", "generateContentReport")
 
   // Once generating a report, we should confirm the report exists (meaning the content is relevant).
   .addConditionalEdges("generateContentReport", routeAfterGeneratingReport, [
-    "generateLinkedinPost",
+    "generatePosts",
     END,
   ])
 
-  // After generating the LinkedIn post, generate the Twitter post.
-  .addEdge("generateLinkedinPost", "generateTwitterPost")
   // Finally, schedule the post. This will also throw an interrupt
   // so a human can edit the post before scheduling.
-  .addEdge("generateTwitterPost", "schedulePost")
+  .addEdge("generatePosts", "schedulePost")
 
   // If the schedule post is successful, end the graph.
   .addEdge("schedulePost", END);

diff --git a/src/agent/subgraphs/generate-post/nodes/generate-content-report.ts b/src/agent/subgraphs/generate-post/nodes/generate-content-report.ts
@@ -1,9 +1,66 @@
 import { LangGraphRunnableConfig } from "@langchain/langgraph";
-import { GraphAnnotation } from "../state.js";
+import { GraphAnnotation } from "../generate-post-state.js";
+import { LANGCHAIN_PRODUCTS_CONTEXT } from "../prompts.js";
+import { ChatAnthropic } from "@langchain/anthropic";
+
+const GENERATE_REPORT_PROMPT = `You are a highly regarded marketing employee at LangChain.
+You have been tasked with writing a report summary on content submitted to you from a third party in hopes of having it promoted by LangChain.
+This summary report will then be used to craft Tweets and LinkedIn posts promoting the content and LangChain products.
+LangChain has a policy of promoting any content submitted that uses LangChain's products.
+
+Here is some context about the different LangChain products and services:
+${LANGCHAIN_PRODUCTS_CONTEXT}
+
+Given this context, examine the user's input closely, and generate a summary report on it.
+
+The summary report should follow the following structure guidelines:
+<structure guidelines>
+1. The first part of the report should be a high level overview of the content. Include the name, what it does/what it aims to achieve/the problems it solves.
+2. The second part should be all about how it implements LangChain's products/services. Cover what product(s) it uses. How these products are used, and why they're important to the application. This should be technical and detailed. Ensure you clearly state the LangChain product(s) used at the top of this section.
+3. The final part should go into detail covering anything the first two parts missed. This should be a detailed technical overview of the content, and interesting facts you found that readers might find engaging. This part does NOT need to be long, and if you've already covered everything, you can skip it. Remember you do NOT want to bore the readers with repetitive information.
+</structure guidelines>
+
+Follow these rules and guidelines when generating the report:
+<rules>
+- Focus on the subject of the content, and why/how LangChain's product(s) enhance it.
+- The final Tweet/LinkedIn post will be developer focused, so ensure the report is technical and detailed.
+- Include any relevant links found in the content in the report.
+- Include details about what the product does/what problem it solves.
+- Use proper markdown styling when formatting the report summary.
+- If possible, keep the post at or under 280 characters (not including the URL) for conciseness.
+</rules>
+
+Do not include any personal opinions or biases in the report. Stick to the facts and technical details.
+Your response should ONLY include the report summary, and no other text.`;
+
+const formatReportPrompt = (pageContents: string[]): string => {
+  return `The following text contains summaries, or entire pages from the content I submitted to you. Please review the content and generate a report on it.
+${pageContents.map((content, index) => `<Content index={${index + 1}}>\n${content}\n</Content>`).join("\n\n")}`;
+};
 
 export async function generateContentReport(
-  _state: typeof GraphAnnotation.State,
+  state: typeof GraphAnnotation.State,
   _config: LangGraphRunnableConfig,
 ): Promise<Partial<typeof GraphAnnotation.State>> {
-  throw new Error("Not implemented");
+  const reportModel = new ChatAnthropic({
+    model: "claude-3-5-sonnet-20241022",
+    temperature: 0,
+  });
+
+  const prompt = formatReportPrompt(state.pageContents);
+
+  const result = await reportModel.invoke([
+    {
+      role: "system",
+      content: GENERATE_REPORT_PROMPT,
+    },
+    {
+      role: "user",
+      content: prompt,
+    },
+  ]);
+
+  return {
+    report: result.content as string,
+  };
 }