-
Notifications
You must be signed in to change notification settings - Fork 2
/
match.sh
executable file
·97 lines (87 loc) · 3.1 KB
/
match.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#! /bin/bash
#
# Usage: ./match.sh [COUNTRY [YEAR_START [YEAR_END [FIELD]]]]
#
# Examples:
#   ./match.sh NO 2018 2019 | more
#   ./match.sh PL 1980 2020 title | more
#
# Environment:
#   COUNT_PDFS  (set externally) decides whether all matching patents are
#               counted, or only those for which the PDF files have been
#               downloaded.
#   FAMILYDIR   overrides the default family-table directory below.
#   INFODIR     overrides the default info-file directory below.

# Stop at the first command that fails.
set -o errexit

# Positional arguments; each one falls back to a default when absent/empty.
COUNTRY="${1:-HR}"
YEAR_START="${2:-1980}"
# Without a third argument the end year is the second argument if that was
# given (single-year run), and 2020 otherwise.
YEAR_END="${3:-${2:-2020}}"
FIELD="${4}"

# Directory holding the generated <COUNTRY>-<YEAR>-matched.txt files.
SCRIPTDIR="${HOME}/tmp"
# Input directories; a value already present in the environment wins.
FAMILYDIR="${FAMILYDIR:-/fs/bil0/europat/family}"
INFODIR="${INFODIR:-/data/patents/pdfpatents}"
# "catch exit status 1" grep wrapper
c1grep() {
  # Run grep with the caller's arguments unchanged and remember its status
  # instead of letting a non-zero status propagate directly.
  local rc=0
  grep "$@" || rc=$?
  # grep status 0 (lines selected) and 1 (no lines selected) both count as
  # success; any higher status makes the wrapper return 1.
  (( rc <= 1 ))
}
# Make sure a matched-ID list exists for every year in the requested range.
# A list that is already present and non-empty is kept as is; otherwise it
# is generated from column 3 of that year's family table (if the table
# exists), with every '-' replaced by '.'.
for (( MATCHYEAR="${YEAR_START}"; MATCHYEAR<="${YEAR_END}"; MATCHYEAR++ )); do
  MATCHFILE="${SCRIPTDIR}/${COUNTRY}-${MATCHYEAR}-matched.txt"
  if [[ -s "${MATCHFILE}" ]]; then
    # already generated on an earlier run
    continue
  fi

  TABFILE="${FAMILYDIR}/EN-${COUNTRY}/EN-${COUNTRY}-${MATCHYEAR}-FamilyID.tab"
  if [[ ! -f "${TABFILE}" ]]; then
    # no family table for this year: leave the list absent
    continue
  fi

  cut -f3 "${TABFILE}" | sed 's/-/./g' > "${MATCHFILE}"
done
# Report section.
#
# Prints one row per year in [YEAR_START, YEAR_END].  Each row has one
# column per match year (newest first) for which a matched-ID file exists
# in SCRIPTDIR; the cell is the number of rows in that year's *-info.txt
# file that contain any ID from the matched-ID file and satisfy the
# selected mode:
#   FIELD set         -> column INDEX of the row starts with a non-space char
#   COUNT_PDFS empty  -> every matching row
#   COUNT_PDFS set    -> rows with a missing text field and with PDFs available
# A header row precedes the first data row and a "total" row (line count of
# each matched-ID file) follows the last one.  Cells are tab-separated.

# Header and footer accumulators.  They hold a literal backslash-t between
# cells, which printf's %b turns into a tab on output.
YEARS="year"
TOTALS="total"
# Applied to a single cut-out column.
SINGLEFIELDPATTERN='^[^[:space:]]+'
# Applied to columns 4-8 of a row: four leading tab-separated tokens.
FIELDPATTERN='^[^[:space:]]+\t[^[:space:]]+\t[^[:space:]]+\t[^[:space:]]+'
# Applied to columns 4-8 of a row: a trailing tab-preceded token.
PDFPATTERN='\t[^[:space:]]+$'

# Map FIELD to its column once, before any work is done, so that a bad
# field name is always reported (on stderr) and is not re-checked for every
# year/match-year combination.
INDEX=""
if [[ -n "${FIELD}" ]]; then
  case "${FIELD}" in
    title)       INDEX=4 ;;
    abstract)    INDEX=5 ;;
    claims)      INDEX=6 ;;
    description) INDEX=7 ;;
    *)
      printf 'Unknown field %s\n' "${FIELD}" >&2
      exit 1
      ;;
  esac
fi

for (( YEAR="${YEAR_START}"; YEAR<="${YEAR_END}"; YEAR++ )); do
  COUNTS=""
  YEARDIR="${INFODIR}/${COUNTRY}-${YEAR}"

  # Locate the info file with a glob instead of parsing `ls` output.  An
  # unmatched glob stays literal, hence the additional existence test.
  INFOFILES=( "${YEARDIR}"/*-info.txt )
  if [[ ${#INFOFILES[@]} -ne 1 || ! -e "${INFOFILES[0]}" ]]; then
    printf 'Expected exactly one *-info.txt file in %s\n' "${YEARDIR}" >&2
    exit 1
  fi
  INFOFILE="${INFOFILES[0]}"

  # iterate through match years in reverse order
  for (( MATCHYEAR="${YEAR_END}"; MATCHYEAR>="${YEAR_START}"; MATCHYEAR-- )); do
    MATCHFILE="${SCRIPTDIR}/${COUNTRY}-${MATCHYEAR}-matched.txt"
    if [[ ! -f "${MATCHFILE}" ]]; then
      continue
    fi

    # The header is only extended while it is still pending, i.e. during
    # the first pass of the outer loop.
    if [[ -n "${YEARS}" ]]; then
      YEARS="${YEARS}\t${MATCHYEAR}"
    fi

    if [[ -n "${INDEX}" ]]; then
      # find matches for a specific field
      COUNT=$(c1grep -F -f "${MATCHFILE}" "${INFOFILE}" \
        | cut -f"${INDEX}" \
        | c1grep -cP "${SINGLEFIELDPATTERN}")
    elif [[ -z "${COUNT_PDFS}" ]]; then
      COUNT=$(c1grep -cF -f "${MATCHFILE}" "${INFOFILE}")
    else
      # find rows with a missing text field and with PDFs available;
      # counting with grep -c keeps a grep failure visible in the
      # pipeline's exit status (a trailing `wc -l` would hide it)
      COUNT=$(c1grep -F -f "${MATCHFILE}" "${INFOFILE}" \
        | cut -f4-8 \
        | c1grep -vP "${FIELDPATTERN}" \
        | c1grep -cP "${PDFPATTERN}")
    fi
    COUNTS="${COUNTS}\t${COUNT}"

    # Collect the totals once, on the last pass of the outer loop; the
    # inner loop order keeps them aligned with the header columns.
    if (( YEAR == YEAR_END )); then
      # Reading from stdin drops the file name from wc's output, and the
      # arithmetic expansion strips any padding some wc versions add.
      TOTAL=$(( $(wc -l < "${MATCHFILE}") ))
      TOTALS="${TOTALS}\t${TOTAL}"
    fi
  done

  # print output
  if [[ -n "${YEARS}" ]]; then
    printf '%b\n' "${YEARS}"
    YEARS=""
  fi
  printf '%b\n' "${YEAR}${COUNTS}"
done

printf '\n%b\n' "${TOTALS}"