From 973a8fb124c96f8db742fdbfefb2fb8c1b250560 Mon Sep 17 00:00:00 2001 From: xf0e Date: Thu, 3 Mar 2022 15:19:58 +0000 Subject: [PATCH] mostly CGL und error handling (#45) * [CGL] corrected comment on CheckForAcceptRequest * [CODING] all incoming request will (even erroneous) will get and log corresponding requestID. [CODING] resource manager now additionally relies on onw size of request queues. Should avoid cases there someone places many request at once * [CODING] more error logs now contains requestID for trouble shooting * [CODING] investigating code for deadlock if multiple requests are placed at same time. do not use in prod * [DEBUGGING] further investigation of a possible deadlock in addNewRequest * [BUGFIX] further investigation of a hang bug * [CODING] /ocr-status will now return a valid JSON with status "not found" and CODE 200 instead of 404 Status code * [CGL] RequestID will now be shown in the logs instead of requestID * [BUILD] removed netgo build flag from Makefile * Makefile now builds static linked executables * [BUGFIX] further investigation of a hang bug * [BUGFIX] further investigation of a hang bug, removed mutex l * [BUGFIX] further investigation of a hang bug, removed mutex l * [BUGFIX] further investigation of a hang bug, removed mutex l * [BUGFIX] further investigation of a hang bug, removed mutex l * [BUGFIX] further investigation of a hang bug, removed mutex l * [BUGFIX] further working on fixing hang bug on many simultaneous requests * [BUGFIX] further working on fixing hang bug on many simultaneous requests * [BUGFIX] further working on fixing hang bug on many simultaneous requests * [BUGFIX] corrected detection of an invalid reply_to address * [BUGFIX] fixed a bug there a deferred request with reply_to not set was returned without of request ID, so the requester didn't know which request to ask for * [BUGFIX] fixed a bug there the deferred requests were still tracked till timeout even if client hat successful downloaded them already * [BUGFIX] fixed race conditions on request counter and res manager [TODO] fix goroutine leak at [chan send, 3 minutes] ocr_rpc_client.go:221 * [BUGFIX] fixed race conditions request counter * [CGL] fixed comments * [CODING] added todo for fixing leaking go routines * [BUGFIX] go routines are not leaking anymore. There now a bug if "deferred": true, eply_to":"" are not set. The in-flight request queue won't be cleaned up for those requests. ocr_resultorage:72 needs to be considered * [CODING] better logging upon shutdown signal * [CODING] working on proper timeout cancel * [CODING] working on proper timeout cancel * [CODING] better logging in status handler * [CODING] correct handling of goroutines with replyto not set and deferred is true * [CGL] just some CGL * [CODING] go mod tidy * [CODING] updated dependency * [FEATURE] if the first tiff to pdf converter in sandwich engine fails, the second one will be used in order process the request * [CODING] removed unneeded logging * [BUGFIX] proper file name on converter switch * [CODING] flag result_optimize is less aggressive. for gs the dCompatibilityLevel level is now 1.7 and dPDFSETTINGS=/prepress. This will result in bigger pdf with more quality * [CODING] Update dependencies for security reason [CODING] Transition from streadway/amqp to rabbitmq/amqp091-go * [CODING ] error handling on os.Remove in preprocessor_rpc_worker.go * [CGL, BUGFIXES] error handling on os.Remove and grammar spelling * [BUGFIXES] error handling on os.Remove Co-authored-by: Artem Mil --- README.md | 8 +- convert-pdf.go | 14 +- docs/idea_post_ocr_base64.http | 4 +- go.mod | 25 +- go.sum | 509 ++++++++++++++++++--------------- ocr_http_handler.go | 9 +- ocr_http_multipart_handler.go | 7 +- ocr_postback_client.go | 9 +- ocr_rpc_client.go | 16 +- ocr_rpc_worker.go | 4 +- ocr_util.go | 35 ++- preprocessor_rpc_worker.go | 16 +- prometheus_metrics.go | 2 +- rabbit_config.go | 2 +- stroke_width_transform.go | 14 +- tesseract_engine.go | 20 +- 16 files changed, 423 insertions(+), 271 deletions(-) diff --git a/README.md b/README.md index ae5c4bd..307cdc2 100644 --- a/README.md +++ b/README.md @@ -61,11 +61,11 @@ The ip address `10.0.2.15` will be used as the `RABBITMQ_HOST` env variable belo * [Install docker-compose](https://docs.docker.com/compose/) * `git clone https://github.com/tleyden/open-ocr.git` * `cd open-ocr/docker-compose` - * Type ```./run.sh ``` (in case you don't have execute right type ```sudo chmod +x run.sh``` + * Type ```./run.sh ``` (in case you don't have "execute" right type ```sudo chmod +x run.sh``` * The runner will ask you if you want to delete the images (choose y or n for each) * The runner will ask you to choose between version 1 and 2 - * Version 1 is using the ocr Tesseract 3.04. The memory usage is light. It is pretty fast and not costly in term of size (a simple aws instance with 1GB of ram and 8GB of storage is sufficiant). Result are acceptable - * Version 2 is using the ocr Tesseract 4.00. The memory usage is light. It is less fast than tesseract 3 and more costly in term of size (an simple aws instance with 1GB of ram is sufficient but with an EBS of 16GB of storage). Result are really better compared to version 3.04. + * Version 1 is using the ocr Tesseract 3.04. The memory usage is light. It is pretty fast and not costly in terms of size (a simple aws instance with 1GB of ram and 8GB of storage is sufficiant). Result are acceptable + * Version 2 is using the ocr Tesseract 4.00. The memory usage is light. It is less fast than tesseract 3 and more costly in terms of size (a simple aws instance with 1GB of ram is sufficient but with an EBS of 16GB of storage). Result are really better compared to version 3.04. * To see a comparative you can have a look to the [official page of tesseract](https://github.com/tesseract-ocr/tesseract/wiki/4.0-Accuracy-and-Performance) @@ -179,7 +179,7 @@ $ curl -X POST -H "Content-Type: application/json" -d '{"img_base64":" 32 { bodyLenToLog = 32 } diff --git a/ocr_rpc_worker.go b/ocr_rpc_worker.go index bb113e5..e280c47 100644 --- a/ocr_rpc_worker.go +++ b/ocr_rpc_worker.go @@ -3,6 +3,7 @@ package ocrworker import ( "encoding/json" "fmt" + amqp "github.com/rabbitmq/amqp091-go" "net/url" "os" "time" @@ -10,7 +11,6 @@ import ( "github.com/rs/zerolog" "github.com/rs/zerolog/log" "github.com/segmentio/ksuid" - "github.com/streadway/amqp" ) type OcrRpcWorker struct { @@ -22,7 +22,7 @@ type OcrRpcWorker struct { } var ( - // tag is based on ksuid K-Sortable Globally Unique IDs + // tag is based on K-Sortable Globally Unique IDs tag = ksuid.New().String() ) diff --git a/ocr_util.go b/ocr_util.go index 814e3f9..2da8f8b 100644 --- a/ocr_util.go +++ b/ocr_util.go @@ -22,26 +22,34 @@ func saveUrlContentToFileName(uri, tmpFileName string) error { if err != nil { return err } + defer func(outFile *os.File) { + err := outFile.Close() + if err != nil { + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg(outFile.Name() + " could not be closed") + } + }(outFile) resp, err := http.Get(uri) if err != nil { - outFile.Close() return err } - - defer resp.Body.Close() + defer func(Body io.ReadCloser) { + err := Body.Close() + if err != nil { + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg(resp.Request.RequestURI + " request Body could not be closed") + } + }(resp.Body) if resp.StatusCode != http.StatusOK { - outFile.Close() return err } _, err = io.Copy(outFile, resp.Body) if err != nil { - outFile.Close() + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg(outFile.Name() + " could not be written") return err } - return outFile.Close() + return err } func saveBytesToFileName(bytes []byte, tmpFileName string) error { @@ -56,10 +64,16 @@ func url2bytes(uri string) ([]byte, error) { return nil, err } - defer resp.Body.Close() + defer func(Body io.ReadCloser) { + err := Body.Close() + if err != nil { + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg(resp.Request.RequestURI + " could not be closed") + } + }(resp.Body) bodyBytes, err := ioutil.ReadAll(resp.Body) if err != nil { + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg("request from " + resp.Request.RequestURI + " could not be read") return nil, err } @@ -86,7 +100,12 @@ func readFirstBytes(filePath string, nBytesToRead uint) ([]byte, error) { if err != nil { return nil, err } - defer file.Close() + defer func(file *os.File) { + err := file.Close() + if err != nil { + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg(file.Name() + " could not be closed") + } + }(file) buffer := make([]byte, nBytesToRead) _, err = file.Read(buffer) diff --git a/preprocessor_rpc_worker.go b/preprocessor_rpc_worker.go index a8e786a..63c93ff 100644 --- a/preprocessor_rpc_worker.go +++ b/preprocessor_rpc_worker.go @@ -10,7 +10,7 @@ import ( "github.com/rs/zerolog/log" "github.com/segmentio/ksuid" - "github.com/streadway/amqp" + amqp "github.com/rabbitmq/amqp091-go" ) type PreprocessorRpcWorker struct { @@ -193,13 +193,23 @@ func (w *PreprocessorRpcWorker) strokeWidthTransform(ocrRequest *OcrRequest) err if err != nil { return err } - defer os.Remove(tmpFileNameInput) + defer func(name string) { + err := os.Remove(name) + if err != nil { + log.Warn().Err(err).Str("component", "PREPROCESSOR_WORKER").Msg(name + " could not be removed") + } + }(tmpFileNameInput) tmpFileNameOutput, err := createTempFileName("") if err != nil { return err } - defer os.Remove(tmpFileNameOutput) + defer func(name string) { + err := os.Remove(name) + if err != nil { + log.Warn().Err(err).Str("component", "PREPROCESSOR_WORKER").Msg(name + " could not be removed") + } + }(tmpFileNameOutput) err = saveBytesToFileName(ocrRequest.ImgBytes, tmpFileNameInput) if err != nil { diff --git a/prometheus_metrics.go b/prometheus_metrics.go index 876bef4..aba9b5e 100644 --- a/prometheus_metrics.go +++ b/prometheus_metrics.go @@ -45,7 +45,7 @@ var ( // InstrumentHttpStatusHandler wraps httpHandler to provide prometheus metrics func InstrumentHttpStatusHandler(ocrHttpHandler *OcrHTTPStatusHandler) http.Handler { - // Register all of the metrics in the standard registry. + // Register all the metrics in the standard registry. prometheus.MustRegister(inFlightGauge, counter, duration, requestSize) ocrChain := promhttp.InstrumentHandlerInFlight(inFlightGauge, diff --git a/rabbit_config.go b/rabbit_config.go index 0010e35..0413228 100644 --- a/rabbit_config.go +++ b/rabbit_config.go @@ -23,7 +23,7 @@ type RabbitConfig struct { /* ResponseCacheTimeout sets default(!!!) global timeout in seconds for request engine will be killed after reaching the time limit, user will get timeout error */ ResponseCacheTimeout uint - // MaximalResponseCacheTimeout client won't be able set the ResponseCacheTimeout higher of it's value + // MaximalResponseCacheTimeout client won't be able to set the ResponseCacheTimeout higher of it's value MaximalResponseCacheTimeout uint FactorForMessageAccept uint } diff --git a/stroke_width_transform.go b/stroke_width_transform.go index f0ae51c..1e4c3c7 100644 --- a/stroke_width_transform.go +++ b/stroke_width_transform.go @@ -21,14 +21,24 @@ func (s StrokeWidthTransformer) preprocess(ocrRequest *OcrRequest) error { if err != nil { return err } - defer os.Remove(tmpFileNameInput) + defer func(name string) { + err := os.Remove(name) + if err != nil { + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg(tmpFileNameInput + " could not be removed") + } + }(tmpFileNameInput) tmpFileNameOutput, err := createTempFileName("") tmpFileNameOutput = fmt.Sprintf("%s.png", tmpFileNameOutput) if err != nil { return err } - defer os.Remove(tmpFileNameOutput) + defer func(name string) { + err := os.Remove(name) + if err != nil { + log.Warn().Err(err).Caller().Str("component", "OCR_UTIL").Msg(tmpFileNameOutput + " could not be removed") + } + }(tmpFileNameOutput) err = saveBytesToFileName(ocrRequest.ImgBytes, tmpFileNameInput) if err != nil { diff --git a/tesseract_engine.go b/tesseract_engine.go index f9853b7..e90ba92 100644 --- a/tesseract_engine.go +++ b/tesseract_engine.go @@ -10,7 +10,7 @@ import ( "github.com/rs/zerolog/log" ) -// This variant of the TesseractEngine calls tesseract via exec +// TesseractEngine calls tesseract via exec type TesseractEngine struct { } @@ -77,7 +77,7 @@ func NewTesseractEngineArgs(ocrRequest *OcrRequest) (*TesseractEngineArgs, error } -// return a slice that can be passed to tesseract binary as command line +// Export return a slice that can be passed to tesseract binary as command line // args, eg, ["-c", "tessedit_char_whitelist=0123456789", "-c", "foo=bar"] func (t TesseractEngineArgs) Export() []string { var result []string @@ -97,7 +97,7 @@ func (t TesseractEngineArgs) Export() []string { } // ProcessRequest will process incoming OCR request by routing it through the whole process chain -func (t TesseractEngine) ProcessRequest(ocrRequest *OcrRequest, workerConfig *WorkerConfig) (OcrResult, error) { +func (t TesseractEngine) ProcessRequest(ocrRequest *OcrRequest, _ *WorkerConfig) (OcrResult, error) { tmpFileName, err := func() (string, error) { switch { @@ -122,7 +122,12 @@ func (t TesseractEngine) ProcessRequest(ocrRequest *OcrRequest, workerConfig *Wo } if engineArgs.saveFiles { - defer os.Remove(tmpFileName) + defer func(name string) { + err := os.Remove(name) + if err != nil { + log.Warn().Err(err).Str("component", "OCR_TESSERACT").Caller().Msg(name + " could not be removed") + } + }(tmpFileName) } ocrResult, err := t.processImageFile(tmpFileName, *engineArgs) @@ -223,7 +228,12 @@ func (t TesseractEngine) processImageFile(inputFilename string, engineArgs Tesse // delete output file when we are done if engineArgs.saveFiles { - defer os.Remove(outFile) + defer func(name string) { + err := os.Remove(name) + if err != nil { + log.Warn().Err(err).Str("component", "OCR_TESSERACT").Caller().Msg(name + " could not be removed") + } + }(outFile) } if err != nil { log.Error().Err(err).Str("component", "OCR_TESSERACT").