Skip to content

Commit

Permalink
Merge branch 'main' into fix-vulnerabilities
Browse files Browse the repository at this point in the history
  • Loading branch information
dehoward committed Jul 27, 2023
2 parents f270cdd + 5d29ff1 commit 2c9c143
Show file tree
Hide file tree
Showing 23 changed files with 339 additions and 115 deletions.
9 changes: 0 additions & 9 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,21 +32,12 @@
},
"typescript.updateImportsOnFileMove.enabled": "always",
"eslint.enable": true,
"eslint.validate": [
"javascript",
"javascriptreact",
"typescript",
"typescriptreact"
],
"eslint.lintTask.enable": true,
"eslint.workingDirectories": [
{
"mode": "auto"
}
],
"eslint.options": {
"overrideConfigFile": "./package.json"
},
"files.associations": {
"*.json": "jsonc"
},
Expand Down
47 changes: 30 additions & 17 deletions webapi/CopilotChat/Controllers/DocumentImportController.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
using SemanticKernel.Service.CopilotChat.Skills;
using SemanticKernel.Service.CopilotChat.Storage;
using SemanticKernel.Service.Services;
using Tesseract;
using UglyToad.PdfPig;
using UglyToad.PdfPig.DocumentLayoutAnalysis.TextExtractor;
using static SemanticKernel.Service.CopilotChat.Models.MemorySource;
Expand Down Expand Up @@ -71,33 +70,36 @@ private enum SupportedFileType

private readonly ILogger<DocumentImportController> _logger;
private readonly DocumentMemoryOptions _options;
private readonly OcrSupportOptions _ocrSupportOptions;
private readonly ChatSessionRepository _sessionRepository;
private readonly ChatMemorySourceRepository _sourceRepository;
private readonly ChatMessageRepository _messageRepository;
private readonly ChatParticipantRepository _participantRepository;
private const string GlobalDocumentUploadedClientCall = "GlobalDocumentUploaded";
private const string ReceiveMessageClientCall = "ReceiveMessage";
private readonly ITesseractEngine _tesseractEngine;
private readonly IOcrEngine _ocrEngine;

/// <summary>
/// Initializes a new instance of the <see cref="DocumentImportController"/> class.
/// </summary>
public DocumentImportController(
ILogger<DocumentImportController> logger,
IOptions<DocumentMemoryOptions> documentMemoryOptions,
IOptions<OcrSupportOptions> ocrSupportOptions,
ChatSessionRepository sessionRepository,
ChatMemorySourceRepository sourceRepository,
ChatMessageRepository messageRepository,
ChatParticipantRepository participantRepository,
ITesseractEngine tesseractEngine)
IOcrEngine ocrEngine)
{
this._logger = logger;
this._options = documentMemoryOptions.Value;
this._ocrSupportOptions = ocrSupportOptions.Value;
this._sessionRepository = sessionRepository;
this._sourceRepository = sourceRepository;
this._messageRepository = messageRepository;
this._participantRepository = participantRepository;
this._tesseractEngine = tesseractEngine;
this._ocrEngine = ocrEngine;
}

/// <summary>
Expand Down Expand Up @@ -259,8 +261,28 @@ private async Task ValidateDocumentImportFormAsync(DocumentImportForm documentIm
}

// Make sure the file type is supported.
// GetFileType throws ArgumentOutOfRangeException if the file type is not supported.
this.GetFileType(Path.GetFileName(formFile.FileName));
var fileType = this.GetFileType(Path.GetFileName(formFile.FileName));
switch (fileType)
{
case SupportedFileType.Txt:
case SupportedFileType.Pdf:
break;
case SupportedFileType.Jpg:
case SupportedFileType.Png:
case SupportedFileType.Tiff:
{
if (this._ocrSupportOptions.Type != OcrSupportOptions.OcrSupportType.None)
{
break;
}

throw new ArgumentException($"Unsupported image file type: {fileType} when " +
$"{OcrSupportOptions.PropertyName}:{nameof(OcrSupportOptions.Type)} is set to " +
nameof(OcrSupportOptions.OcrSupportType.None));
}
default:
throw new ArgumentException($"Unsupported file type: {fileType}");
}
}
}

Expand Down Expand Up @@ -455,17 +477,8 @@ private SupportedFileType GetFileType(string fileName)
/// <returns>A string of the content of the file.</returns>
private async Task<string> ReadTextFromImageFileAsync(IFormFile file)
{
await using (var ms = new MemoryStream())
{
await file.CopyToAsync(ms);
var fileBytes = ms.ToArray();
await using var imgStream = new MemoryStream(fileBytes);

using var img = Pix.LoadFromMemory(imgStream.ToArray());

using var page = this._tesseractEngine.Process(img);
return page.GetText();
}
var textFromFile = await this._ocrEngine.ReadTextFromImageFileAsync(file);
return textFromFile;
}

/// <summary>
Expand Down
12 changes: 8 additions & 4 deletions webapi/CopilotChat/Extensions/ServiceExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.IO;
using System.Reflection;
using Azure;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Options;
Expand Down Expand Up @@ -89,18 +90,21 @@ public static IServiceCollection AddPersistentOcrSupport(this IServiceCollection

switch (ocrSupportConfig.Type)
{
case OcrSupportOptions.OcrSupportType.AzureFormRecognizer:
{
services.AddSingleton<IOcrEngine>(sp => new AzureFormRecognizerOcrEngine(ocrSupportConfig.AzureFormRecognizer!.Endpoint!, new AzureKeyCredential(ocrSupportConfig.AzureFormRecognizer!.Key!)));
break;
}
case OcrSupportOptions.OcrSupportType.Tesseract:
{
services.AddSingleton<ITesseractEngine>(sp => new TesseractEngineWrapper(new TesseractEngine(ocrSupportConfig.Tesseract!.FilePath, ocrSupportConfig.Tesseract!.Language, EngineMode.Default)));
services.AddSingleton<IOcrEngine>(sp => new TesseractEngineWrapper(new TesseractEngine(ocrSupportConfig.Tesseract!.FilePath, ocrSupportConfig.Tesseract!.Language, EngineMode.Default)));
break;
}

case OcrSupportOptions.OcrSupportType.None:
{
services.AddSingleton<ITesseractEngine>(sp => new NullTesseractEngine());
services.AddSingleton<IOcrEngine>(sp => new NullOcrEngine());
break;
}

default:
{
throw new InvalidOperationException($"Unsupported OcrSupport:Type '{ocrSupportConfig.Type}'");
Expand Down
26 changes: 26 additions & 0 deletions webapi/CopilotChat/Options/AzureFormRecognizerOptions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;
using SemanticKernel.Service.Options;

namespace SemanticKernel.Service.CopilotChat.Options;

/// <summary>
/// Configuration options for Azure Form Recognizer OCR support.
/// </summary>
public sealed class AzureFormRecognizerOptions
{
    /// <summary>
    /// Name of the configuration property that carries these options
    /// (the README refers to it as <c>OcrSupport:AzureFormRecognizer</c>).
    /// </summary>
    public const string PropertyName = "AzureFormRecognizer";

    /// <summary>
    /// Endpoint of the provisioned Azure Form Recognizer instance to call.
    /// Must be present and must not be empty or whitespace.
    /// </summary>
    [Required]
    [NotEmptyOrWhitespace]
    public string Endpoint { get; set; } = string.Empty;

    /// <summary>
    /// Access key of the provisioned Azure Form Recognizer instance.
    /// Must be present and must not be empty or whitespace.
    /// </summary>
    [Required]
    [NotEmptyOrWhitespace]
    public string Key { get; set; } = string.Empty;
}
15 changes: 13 additions & 2 deletions webapi/CopilotChat/Options/OcrSupportOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,12 @@ public enum OcrSupportType
/// <summary>
/// Tesseract OCR Support
/// </summary>
Tesseract
Tesseract,

/// <summary>
/// Azure Form Recognizer OCR Support
/// </summary>
AzureFormRecognizer
}

/// <summary>
Expand All @@ -30,8 +35,14 @@ public enum OcrSupportType
public OcrSupportType Type { get; set; } = OcrSupportType.None;

/// <summary>
/// Gets or sets the configuration for the Tesseract OCR support.
/// Gets or sets the configuration for Tesseract OCR support.
/// </summary>
[RequiredOnPropertyValue(nameof(Type), OcrSupportType.Tesseract)]
public TesseractOptions? Tesseract { get; set; }

/// <summary>
/// Gets or sets the configuration for Azure Form Recognizer OCR support.
/// </summary>
[RequiredOnPropertyValue(nameof(Type), OcrSupportType.AzureFormRecognizer)]
public AzureFormRecognizerOptions? AzureFormRecognizer { get; set; }
}
1 change: 1 addition & 0 deletions webapi/CopilotChatWebApi.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.AI.FormRecognizer" Version="4.0.0" />
<PackageReference Include="Microsoft.Azure.Cosmos" Version="3.35.2" />
<PackageReference Include="Microsoft.SemanticKernel" Version="0.17.230718.1-preview" />
<PackageReference Include="Microsoft.SemanticKernel.Connectors.AI.OpenAI" Version="0.17.230718.1-preview" />
Expand Down
9 changes: 6 additions & 3 deletions webapi/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,12 @@ Before you get started, make sure you have the following requirements in place:

> To clean your system of the developer certificate, run `dotnet dev-certs https --clean`

5. **(Optional)** To enable support for uploading image file formats such as png, jpg and tiff, we have included the [Tesseract](https://www.nuget.org/packages/Tesseract) nuget package.
- You will need to obtain one or more [tessdata language data files](https://github.com/tesseract-ocr/tessdata) such as `eng.traineddata` and add them to your `./data` directory or the location specified in the `Tesseract.FilePath` location in `./appsettings.json`.
- Set the `Copy to Output Directory` value to `Copy if newer`.
5. **(Optional)** To enable support for uploading image file formats such as png, jpg and tiff, there are two options within the `OcrSupport` section of `./appsettings.json`: the Tesseract open source library and Azure Form Recognizer.
- **Tesseract**: we have included the [Tesseract](https://www.nuget.org/packages/Tesseract) NuGet package.
- You will need to obtain one or more [tessdata language data files](https://github.com/tesseract-ocr/tessdata) such as `eng.traineddata` and add them to your `./data` directory or the location specified by the `OcrSupport:Tesseract:FilePath` setting in `./appsettings.json`.
- Set the `Copy to Output Directory` value to `Copy if newer`.
- **Azure Form Recognizer**: we have included the [Azure.AI.FormRecognizer](https://www.nuget.org/packages/Azure.AI.FormRecognizer) NuGet package.
- You will need to obtain an [Azure Form Recognizer](https://azure.microsoft.com/en-us/services/form-recognizer/) resource and add the `OcrSupport:AzureFormRecognizer:Endpoint` and `OcrSupport:AzureFormRecognizer:Key` values to the `./appsettings.json` file.
# Start the WebApi Service
Expand Down
59 changes: 59 additions & 0 deletions webapi/Services/AzureFormRecognizerOcrEngine.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Azure;
using Azure.AI.FormRecognizer;
using Azure.AI.FormRecognizer.Models;
using Microsoft.AspNetCore.Http;

namespace SemanticKernel.Service.Services;

/// <summary>
/// Wrapper for the Azure.AI.FormRecognizer. This allows Form Recognizer to be used as the OCR engine for reading text from files with an image MIME type.
/// </summary>
public class AzureFormRecognizerOcrEngine : IOcrEngine
{
    /// <summary>
    /// Creates a new instance of the AzureFormRecognizerOcrEngine passing in the Form Recognizer endpoint and key.
    /// </summary>
    /// <param name="endpoint">The endpoint for accessing a provisioned Azure Form Recognizer instance</param>
    /// <param name="credential">The AzureKeyCredential containing the provisioned Azure Form Recognizer access key</param>
    public AzureFormRecognizerOcrEngine(string endpoint, AzureKeyCredential credential)
    {
        this.FormRecognizerClient = new FormRecognizerClient(new Uri(endpoint), credential);
    }

    /// <summary>
    /// The Form Recognizer client that this engine sends its recognition requests through.
    /// </summary>
    public FormRecognizerClient FormRecognizerClient { get; }

    /// <summary>
    /// Reads all text from the image file by submitting it to Form Recognizer and
    /// concatenating the recognized lines, one output line per recognized line.
    /// </summary>
    /// <param name="imageFile">A file that is expected to be an image MIME type</param>
    /// <returns>The recognized text; each line's words are joined with single spaces.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="imageFile"/> is null.</exception>
    public async Task<string> ReadTextFromImageFileAsync(IFormFile imageFile)
    {
        // Fail with a descriptive argument error instead of a NullReferenceException below.
        if (imageFile is null)
        {
            throw new ArgumentNullException(nameof(imageFile));
        }

        // Buffer the upload in memory and rewind it so the client reads from the first byte.
        await using var imgStream = new MemoryStream();
        await imageFile.CopyToAsync(imgStream);
        imgStream.Position = 0;

        // Start the OCR operation
        RecognizeContentOperation operation = await this.FormRecognizerClient.StartRecognizeContentAsync(imgStream);

        // Wait for the result
        Response<FormPageCollection> operationResponse = await operation.WaitForCompletionAsync();
        FormPageCollection formPages = operationResponse.Value;

        StringBuilder text = new();
        foreach (FormPage page in formPages)
        {
            foreach (FormLine line in page.Lines)
            {
                text.AppendLine(string.Join(" ", line.Words.Select(word => word.Text)));
            }
        }

        return text.ToString();
    }
}
20 changes: 20 additions & 0 deletions webapi/Services/IOcrEngine.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;

namespace SemanticKernel.Service.Services;

/// <summary>
/// An OCR engine that can read in text from image MIME type files.
/// </summary>
public interface IOcrEngine
{

/// <summary>
/// Reads all text from the image file.
/// </summary>
/// <param name="imageFile">A file that is expected to be an image MIME type</param>
/// <returns>A task whose result is the text read from the image file.</returns>
Task<string> ReadTextFromImageFileAsync(IFormFile imageFile);
}
26 changes: 0 additions & 26 deletions webapi/Services/ITesseractEngine.cs

This file was deleted.

24 changes: 24 additions & 0 deletions webapi/Services/NullOcrEngine.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (c) Microsoft. All rights reserved.

using System;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Http;

namespace SemanticKernel.Service.Services;

/// <summary>
/// Used as a placeholder implementation when "none" is set in the OcrSupport:Type field in the configuration.
/// </summary>
public class NullOcrEngine : IOcrEngine
{
    // Text of the exception raised whenever OCR is requested from this placeholder engine.
    private const string OcrNotConfiguredMessage = "You must specify a \"Type\" other than \"none\" within the \"OcrSupport\" application settings to use the image upload feature. See the README.md";

    /// <summary>
    /// Always fails, telling the user that a valid OcrSupport type must be configured
    /// before the image upload feature can be used.
    /// </summary>
    /// <param name="imageFile">Ignored</param>
    /// <returns>Never returns a task; a NotImplementedException is thrown on every call</returns>
    /// <exception cref="NotImplementedException">Thrown on every call.</exception>
    public Task<string> ReadTextFromImageFileAsync(IFormFile imageFile)
        => throw new NotImplementedException(OcrNotConfiguredMessage);
}
23 changes: 0 additions & 23 deletions webapi/Services/NullTesseractEngine.cs

This file was deleted.

Loading

0 comments on commit 2c9c143

Please sign in to comment.