Skip to content

Commit

Permalink
contrib/tesseract-langs.sh: add script to generate recipes for tesseract languages
Browse files Browse the repository at this point in the history

This script writes language recipes for tesseract.  It downloads the
listing of available languages and language tarballs from the official
site and writes language recipes tesseract-lang-<lang>_<version>.bb
for each language.

Signed-off-by: Mario Domenech Goulart <[email protected]>
Signed-off-by: Martin Jansa <[email protected]>
  • Loading branch information
Mario Domenech Goulart authored and shr-project committed Jun 10, 2014
1 parent fccc8f1 commit cb41796
Showing 1 changed file with 92 additions and 0 deletions.
92 changes: 92 additions & 0 deletions contrib/tesseract-langs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#! /bin/sh

# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)

# Version shared by the tesseract release and its language tarballs.
PV="3.02"

# The main software tarball may carry an extra minor version that the
# language tarballs do not have. For instance:
#   software package: tesseract-ocr-3.02.02.tar.gz
#   language package: tesseract-ocr-3.02.por.tar.gz
MINOR_PV="02"

# First positional argument: directory the generated recipes are written to.
recipes_dir="$1"

# Print a one-line usage summary on standard output.
usage() {
  # "$0" is quoted so that a script path containing whitespace is still
  # reduced to its base name as a single argument.
  echo "Usage: $(basename "$0") <recipes dir> [ <download dir> ]"
}

# The recipes directory is mandatory: without it, show the usage text and
# stop with a failure status.
[ -n "$recipes_dir" ] || {
  usage
  exit 1
}

# Create the output directory (and any missing parents) up front.
mkdir -p "$recipes_dir"

# Page listing the downloadable tesseract-ocr files; tesseract_langs fetches
# it into $file_list and extracts the language codes from it.
file_list_uri='https://code.google.com/p/tesseract-ocr/downloads/list'
file_list=$(mktemp) || exit 1

# Use the download directory given as second argument when there is one.
# Otherwise work in a temporary directory and flag it for removal
# (remove_dl_dir=1) once the recipes have been written.
remove_dl_dir=
if [ -z "$2" ]; then
  remove_dl_dir=1
  dl_dir=$(mktemp -d) || {
    rm -f "$file_list"
    exit 1
  }
else
  dl_dir="$2"
fi

# Quoted so that a download directory containing whitespace is created as
# one path instead of being split into several.
mkdir -p "$dl_dir"

# Print the language codes that have a tesseract-ocr-${PV}.<lang>.tar.gz
# entry in the download listing, one per line, sorted and de-duplicated.
#
# Globals read: PV, MINOR_PV, file_list, file_list_uri
# Side effects: overwrites "$file_list" with the downloaded listing page.
# Returns:      non-zero when the listing cannot be downloaded.
tesseract_langs() {
  local pv_re

  # Escape the dots of PV so that e.g. "3.02" matches only itself when it is
  # embedded in the regular expressions below.
  pv_re=$(printf '%s\n' "$PV" | sed -e 's/\./\\./g')

  if ! wget -q -O "$file_list" "$file_list_uri"; then
    echo "Failed to download $file_list_uri" >&2
    return 1
  fi

  # 1. keep the lines linking to a ${PV} tarball;
  # 2. reduce each of them to the part between the version and ".tar.gz";
  # 3. drop the entries that are not languages: the software tarball itself
  #    (whose "language" part is the minor version) and its HTML
  #    documentation. -x anchors the match to the whole line so that a
  #    language code merely containing the minor version is kept.
  grep -E 'a href="detail\?name=tesseract-ocr-'"${pv_re}"'\.[^.]+\.tar\.gz&amp;can=2&amp;q=">' "$file_list" |
    sed -E -e 's/.*tesseract-ocr-'"${pv_re}"'\.*([^.]+)\.tar\.gz.*/\1/' |
    grep -Evx -e "${MINOR_PV}(-doc-html)?" |
    sort -u
}

# Download the language tarball of every language in a list into $dl_dir,
# skipping the ones that are already there.
#
# Arguments:    $1 - whitespace-separated list of language codes
# Globals read: PV, dl_dir
# Outputs:      a "Downloading <uri>" line on stdout per fetched tarball;
#               failures are reported on stderr.
# Returns:      non-zero when at least one download failed.
download_lang_files() {
  local langs="$1"
  local lang tarball uri
  local rc=0

  # $langs is left unquoted on purpose: it is split into one word per language.
  for lang in $langs; do
    tarball="$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz"
    if [ ! -e "$tarball" ]; then
      uri="https://tesseract-ocr.googlecode.com/files/tesseract-ocr-${PV}.${lang}.tar.gz"
      echo "Downloading $uri"
      if ! wget -q -P "$dl_dir" "$uri"; then
        echo "Failed to download $uri" >&2
        # Remove whatever was left behind: the existence test above would
        # otherwise treat a broken file as a finished download next time.
        rm -f "$tarball"
        rc=1
      fi
    fi
  done

  return "$rc"
}

# Write the recipe tesseract-lang-<lang>_${PV}.bb for one language into
# $recipes_dir, filling in the checksums of its downloaded tarball.
# Underscores in the language code become dashes in the recipe file name.
#
# Arguments:    $1 - language code
# Globals read: PV, dl_dir, recipes_dir
# Returns:      non-zero (and writes nothing) when the tarball is missing.
create_recipe() {
  local lang="$1"
  local tarball recipe md5 sha256

  tarball="$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz"

  # Without the tarball the checksums would come out empty; refuse to write
  # such a recipe.
  if [ ! -f "$tarball" ]; then
    echo "Cannot create recipe for $lang: $tarball not found" >&2
    return 1
  fi

  md5=$(md5sum "$tarball" | awk '{print $1}')
  sha256=$(sha256sum "$tarball" | awk '{print $1}')

  recipe="$recipes_dir/tesseract-lang-$(printf '%s\n' "$lang" | sed 's/_/-/g')_${PV}.bb"

  cat > "$recipe" <<EOF
# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)
TESSERACT_LANG = "$lang"
require tesseract-lang.inc
SRC_URI[md5sum] = "${md5}"
SRC_URI[sha256sum] = "${sha256}"
EOF
}


# Main flow: find out which languages exist, fetch their tarballs and write
# one recipe per language.
LANGS=$(tesseract_langs)

status=0
if [ -z "$LANGS" ]; then
  # Nothing to generate; report it instead of finishing silently.
  echo "No tesseract ${PV} language packages found at $file_list_uri" >&2
  status=1
fi

download_lang_files "$LANGS"

# $LANGS is left unquoted on purpose: it is split into one word per language.
for lang in $LANGS; do
  create_recipe "$lang"
done

# Clean up. The download directory is removed only when this script created
# it as a temporary directory. ${dl_dir:?} aborts rather than run "rm -rf"
# with an empty path.
if [ -n "$remove_dl_dir" ]; then
  rm -rf -- "${dl_dir:?}"
fi
rm -f -- "$file_list"

exit "$status"

0 comments on commit cb41796

Please sign in to comment.