feat: add support to retrieve images generated by ImageFX, update readme
HanaokaYuzu committed Feb 12, 2024
1 parent 2069d62 commit df74f4a
Showing 3 changed files with 150 additions and 31 deletions.
114 changes: 89 additions & 25 deletions README.md
@@ -1,6 +1,17 @@
<p align="center">
<img src="https://www.gstatic.com/lamda/images/gemini_wordmark_landing_page_238102af073d0ae2763aa5.svg" alt="Gemini Wordmark" align="center">
</p>

# <img src="https://www.gstatic.com/lamda/images/favicon_v1_150160cddff7f294ce30.svg" width="35px" alt="Gemini Icon" /> Gemini-API

Reverse-engineered asynchronous python wrapper for Google Gemini (formerly Bard) web client, providing a simple and elegant interface inspired by Google GenerativeAI's official API.
A reverse-engineered asynchronous python wrapper for [Google Gemini](https://gemini.google.com) (formerly Bard).

## Features

- **ImageFX Support** - Supports retrieving images generated by ImageFX, Google's latest AI image generator.
- **Classified Outputs** - Automatically categorizes texts, web images and AI-generated images from the response.
- **Official Flavor** - Provides a simple and elegant interface inspired by [Google Generative AI](https://ai.google.dev/tutorials/python_quickstart)'s official API.
- **Asynchronous** - Utilizes `asyncio` to run generation tasks and return outputs efficiently.

## Installation

@@ -10,60 +21,113 @@ pip install gemini-webapi

## Authentication

Go to <https://gemini.google.com/> and log in with your Google account
Go to <https://gemini.google.com> and log in with your Google account
Press F12 to open the web inspector, go to the `Network` tab and refresh the page
Click any request and copy the cookie values of `__Secure-1PSID` and `__Secure-1PSIDTS`

Note: `__Secure-1PSIDTS` may expire frequently if the Google account is actively used elsewhere, especially when visiting <https://gemini.google.com> directly. It's recommended to use a separate Google account if you are building a keep-alive service with this package.
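
If you prefer not to hard-code the cookie values in your script, one option is to read them from environment variables before creating the client. A minimal sketch (the environment variable names here are arbitrary choices, not something the package requires):

```python
import os

# Hypothetical variable names -- any names work, as long as the environment
# variables hold the cookie values copied from the browser.
Secure_1PSID = os.environ["SECURE_1PSID"]
Secure_1PSIDTS = os.environ["SECURE_1PSIDTS"]
```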

## Usage

### Initialization

Import required packages and initialize a client with your cookies obtained from the previous step.

```python
import asyncio
from gemini import GeminiClient

# Replace "COOKIE VALUE HERE" with your actual cookie values as strings
# Replace "COOKIE VALUE HERE" with your actual cookie values
Secure_1PSID = "COOKIE VALUE HERE"
Secure_1PSIDTS = "COOKIE VALUE HERE"

client = GeminiClient(Secure_1PSID, Secure_1PSIDTS, proxy=None)
await client.init()
async def main():
    client = GeminiClient(Secure_1PSID, Secure_1PSIDTS, proxy=None)
    await client.init(timeout=30)

asyncio.run(main())
```
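
The snippets in the following sections each define their own `main()` and assume a client that has already been initialized. In a single script you would typically initialize the client once and reuse it for every request; a combined sketch:

```python
import asyncio

from gemini import GeminiClient

Secure_1PSID = "COOKIE VALUE HERE"
Secure_1PSIDTS = "COOKIE VALUE HERE"

async def main():
    # Initialize once, then reuse the same client for any number of requests
    client = GeminiClient(Secure_1PSID, Secure_1PSIDTS, proxy=None)
    await client.init(timeout=30)

    response = await client.generate_content("Hello World!")
    print(response.text)

asyncio.run(main())
```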

### Generate contents from text inputs

Ask a quick one-turn question by calling `GeminiClient.generate_content`.

```python
async def main():
    response = await client.generate_content("Hello World!")
    print(response.text)

asyncio.run(main())
```

Note: if you just want to see the response text, you can simply use `print(response)` to get the same output.
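
Besides `response.text`, the returned output also exposes `response.images` and `response.candidates`, which the sections below build on, so a single call gives you everything to inspect. A short sketch, assuming the initialized `client` from above:

```python
async def main():
    response = await client.generate_content("Hello World!")
    print(response.text)             # text of the chosen reply candidate
    print(len(response.images))      # images attached to the reply, may be empty
    print(len(response.candidates))  # all reply candidates returned by Gemini

asyncio.run(main())
```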

### Conversations across multiple turns

If you want to keep the conversation continuous, use `GeminiClient.start_chat` to create a `ChatSession` object and send messages through it. The conversation history is handled automatically and updated after each turn.

```python
response = await client.generate_content("Hello World!")
print(response.text) # Note: simply use print(response) to get the same output if you just want to see the response text
async def main():
    chat = client.start_chat()
    response1 = await chat.send_message("Briefly introduce Europe")
    response2 = await chat.send_message("What's the population there?")
    print(response1.text, response2.text, sep="\n\n----------------------------------\n\n")

asyncio.run(main())
```

### Retrieve images in response

Images in the API's output are stored as a list of `Image` objects. You can access each image's title, URL, and description via `image.title`, `image.url`, and `image.alt` respectively.

```python
response = await client.generate_content("Send me some pictures of cats")
images = response.images
for image in images:
    print(f"{image.title}({image.url}) - {image.alt}", sep="\n")
async def main():
    response = await client.generate_content("Send me some pictures of cats")
    images = response.images
    for image in images:
        print(image, "\n\n----------------------------------\n")

asyncio.run(main())
```
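
The `Image` objects only carry metadata (`title`, `url`, `alt`). If you want the image files themselves, you can fetch them from `image.url` with any HTTP client. Below is a rough sketch using `httpx`; this helper is an illustration rather than part of the package's API, and some image URLs may additionally require your login cookies to download:

```python
import httpx

async def save_images(images, folder="."):
    # Download each image by its URL; the filenames chosen here are arbitrary
    async with httpx.AsyncClient(follow_redirects=True) as http:
        for i, image in enumerate(images):
            resp = await http.get(image.url)
            resp.raise_for_status()
            with open(f"{folder}/image_{i}.png", "wb") as f:
                f.write(resp.content)

# Usage inside an async context: await save_images(response.images)
```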

### Conversations across multiple turns
### Generate images with ImageFX

In February 2024, Google introduced a new AI image generator called ImageFX and integrated it into Gemini. You can ask Gemini to generate images with ImageFX simply by using natural language.

```python
chat = client.start_chat() # A chat stores the metadata to keep a conversation continuous. It will automatically get updated after each turn
response1 = await chat.send_message("Briefly introduce Europe")
response2 = await chat.send_message("What's the population there?")
print(response1.text, response2.text, sep="\n----------------------------------\n")
async def main():
    response = await client.generate_content("Generate some pictures of cats")
    images = response.images
    for image in images:
        print(image, "\n\n----------------------------------\n")

asyncio.run(main())
```

Note: by default, when asked to send images (as in the previous example), Gemini sends images fetched from the web instead of generating them with an AI model, unless you specifically ask it to "generate" images in your prompt. In this package, web images and generated images are treated differently as `WebImage` and `GeneratedImage`, and are automatically categorized in the output.
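
Since the two image types are distinct classes (see `src/gemini/types.py` below), you can tell them apart with `isinstance` checks on `response.images`. A short sketch, assuming the initialized `client` from above; the import path is an assumption based on this commit's file layout:

```python
from gemini.types import WebImage, GeneratedImage  # import path assumed

async def main():
    response = await client.generate_content("Generate some pictures of cats")
    web = [img for img in response.images if isinstance(img, WebImage)]
    generated = [img for img in response.images if isinstance(img, GeneratedImage)]
    print(f"{len(web)} web image(s), {len(generated)} AI-generated image(s)")

asyncio.run(main())
```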

### Check and switch to other answer candidates

A response from Gemini usually contains multiple reply candidates with different generated content. You can check all candidates and choose one to continue the conversation. By default, the first candidate is chosen automatically.

```python
chat = client.start_chat()
response = await chat.send_message("What's the best Japanese dish in your mind? Choose one only.")
for candidate in response.candidates:
    print(candidate, "\n----------------------------------\n")

# Control the ongoing conversation flow by choosing candidate manually
new_candidate = chat.choose_candidate(index=1) # Choose the second candidate here
followup_response = await chat.send_message("Tell me more about it.") # Will generate contents based on the chosen candidate
print(new_candidate, followup_response, sep="\n----------------------------------\n")
async def main():
    # Start a conversation and list all reply candidates
    chat = client.start_chat()
    response = await chat.send_message("What's the best Japanese dish? Recommend one only.")
    for candidate in response.candidates:
        print(candidate, "\n\n----------------------------------\n")

    # Control the ongoing conversation flow by choosing candidate manually
    new_candidate = chat.choose_candidate(index=1) # Choose the second candidate here
    followup_response = await chat.send_message("Tell me more about it.") # Will generate contents based on the chosen candidate
    print(new_candidate, followup_response, sep="\n\n----------------------------------\n\n")

asyncio.run(main())
```
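
Each `Candidate` also carries the `rcid` and `text` fields defined in `src/gemini/types.py`, so you can log which candidate the conversation is following. A small sketch, assuming the initialized `client` from above:

```python
async def main():
    chat = client.start_chat()
    response = await chat.send_message("What's the best Japanese dish? Recommend one only.")
    # Log each reply candidate's ID alongside a short text preview
    for candidate in response.candidates:
        print(candidate.rcid, "-", candidate.text[:60])

asyncio.run(main())
```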

## References

[Google AI Studio](https://ai.google.dev/tutorials/ai-studio_quickstart)

[acheong08/Bard](https://github.com/acheong08/Bard)
29 changes: 24 additions & 5 deletions src/gemini/client.py
@@ -9,7 +9,8 @@

from .consts import HEADERS
from .types import (
    Image,
    WebImage,
    GeneratedImage,
    Candidate,
    ModelOutput,
    AuthError,
@@ -121,7 +122,7 @@ async def init(
            logger.success("Gemini client initiated successfully.")
        else:
            raise AuthError(
                "Failed to initiate client. SNlM0e not found in response, make sure cookie values are valid."
                "Failed to initiate client. SECURE_1PSIDTS could get expired frequently, please make sure cookie values are up to date."
            )

        self.auto_close = auto_close
@@ -195,6 +196,7 @@ async def generate_content(
)

        if response.status_code != 200:
            self.running = False
            raise APIError(
                f"Failed to generate contents. Request failed with status code {response.status_code}"
            )
@@ -203,16 +205,33 @@

        candidates = []
        for candidate in body[4]:
            images = (
            web_images = (
                candidate[4]
                and [
                    Image(url=image[0][0][0], title=image[2], alt=image[0][4])
                    WebImage(url=image[0][0][0], title=image[2], alt=image[0][4])
                    for image in candidate[4]
                ]
                or []
            )
            generated_images = (
                candidate[12]
                and candidate[12][7]
                and candidate[12][7][0]
                and [
                    GeneratedImage(
                        url=image[0][3][3], title=f"[Generated Image {image[3][6]}]", alt=image[3][5][i]
                    )
                    for i, image in enumerate(candidate[12][7][0])
                ]
                or []
            )
            candidates.append(
                Candidate(rcid=candidate[0], text=candidate[1][0], images=images)
                Candidate(
                    rcid=candidate[0],
                    text=candidate[1][0],
                    web_images=web_images,
                    generated_images=generated_images,
                )
            )
        if not candidates:
            raise GeminiError(
38 changes: 37 additions & 1 deletion src/gemini/types.py
@@ -26,17 +26,53 @@ def __repr__(self):
        return f"""Image(title='{self.title}', url='{len(self.url) <= 20 and self.url or self.url[:8] + '...' + self.url[-12:]}', alt='{self.alt}')"""


class WebImage(Image):
    """
    Image retrieved from the web. Returned when asking Gemini to "SEND an image of [something]".
    """

    pass


class GeneratedImage(Image):
    """
    Image generated by ImageFX, Google's AI image generator. Returned when asking Gemini to "GENERATE an image of [something]".
    """

    pass


class Candidate(BaseModel):
    """
    A single reply candidate object in the model output. A full response from Gemini usually contains multiple reply candidates.

    Parameters
    ----------
    rcid: `str`
        Reply candidate ID to build the metadata.
    text: `str`
        Text output.
    web_images: `list[WebImage]`, optional
        List of web images in reply, can be empty.
    generated_images: `list[GeneratedImage]`, optional
        List of generated images in reply, can be empty.
    """

    rcid: str
    text: str
    images: list[Image]
    web_images: list[WebImage] = []
    generated_images: list[GeneratedImage] = []

    def __str__(self):
        return self.text

    def __repr__(self):
        return f"Candidate(rcid='{self.rcid}', text='{len(self.text) <= 20 and self.text or self.text[:20] + '...'}', images={self.images})"

    @property
    def images(self) -> list[Image]:
        return self.web_images + self.generated_images


class ModelOutput(BaseModel):
    """
