Skip to content

Commit

Permalink
! improve uq and add its documents
Browse files Browse the repository at this point in the history
  • Loading branch information
oldratlee committed Feb 19, 2021
1 parent e48fac9 commit 513a869
Show file tree
Hide file tree
Showing 4 changed files with 394 additions and 20 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ source <(curl -fsSL https://raw.githubusercontent.com/oldratlee/useful-scripts/r
彩色`cat`出文件行,方便人眼区分不同的行。
1. [a2l](docs/shell.md#-a2l)
按行彩色输出参数,方便人眼查看。
1. [uq](docs/shell.md#-uq)
无需对输入排序即可完成整行去重。相比系统的`uniq`命令,加强之处是可以对不相邻的重复行去重,不需要先排序输入。
1. [ap and rp](docs/shell.md#-ap-and-rp)
批量转换文件路径为绝对路径/相对路径,会自动跟踪链接并规范化路径。
1. [tcp-connection-state-counter](docs/shell.md#-tcp-connection-state-counter)
Expand Down
74 changes: 74 additions & 0 deletions bin/helper/uq.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#!/usr/local/bin/awk -f

# Print the collected lines according to the uq_opt_* option variables.
#
# for_lines: array indexed 0..n-1 holding lines in input order.
#
# The names after the wide gap in the parameter list are awk-style local
# variables; callers pass only for_lines.
#
# Reads globals: line_count_array, uq_opt_only_unique, uq_opt_only_repeated,
#                uq_opt_repeated_method.
function printResult(for_lines,    total, key, idx, line, count, has_outputted, outputted) {
    # Count the elements with a for-in loop instead of length(array):
    # length() on an array is not supported by every awk implementation.
    total = 0
    for (key in for_lines) total++

    has_outputted = 0
    for (idx = 0; idx < total; idx++) {
        line = for_lines[idx]
        count = line_count_array[storeLine(line)]

        if (uq_opt_only_unique) {
            if (count == 1) printLine(count, line)
            continue
        }

        if (uq_opt_only_repeated && count <= 1) continue

        # Emit the empty delimiter line when a new group starts:
        #   prepend:  before every group, including the first one
        #   separate: only between two groups
        # An explicit flag is used rather than the truthiness of the previous
        # line, because the previous line may itself be an empty string.
        if (uq_opt_repeated_method == "prepend") {
            if (!has_outputted || !compareLine(line, outputted)) print ""
        } else if (uq_opt_repeated_method == "separate") {
            if (has_outputted && !compareLine(line, outputted)) print ""
        }

        printLine(count, line)
        outputted = line
        has_outputted = 1
    }
}

# Write one output record; with -c/--count the record is prefixed by its
# occurrence count, right-aligned in 7 columns.
function printLine(count, line) {
    if (!uq_opt_count) {
        print line
        return
    }

    printf "%7s %s%s", count, line, ORS
}

# Map a line to the key used for counting and comparing:
# lower-cased under -i/--ignore-case, otherwise the line itself.
function storeLine(line) {
    return uq_opt_ignore_case ? tolower(line) : line
}

# Return true when both lines belong to the same group, i.e. their keys
# (see storeLine) are identical strings.
function compareLine(line1, line2) {
    # Appending "" forces a string comparison. Values that originate from input
    # records and look numeric could otherwise be compared as numbers, which
    # would treat e.g. "1" and "1.0" as the same line although they are counted
    # as different groups.
    return (storeLine(line1) "") == (storeLine(line2) "")
}


# With -z/--zero-terminated, records are delimited by NUL on input and output.
BEGIN {
    if (uq_opt_zero_terminated) RS = ORS = "\0"
}


# Runs for every input record: remember the record and count its group.
{
    # use index to keep lines order
    lines[line_index++] = $0

    # line_count_array: comparison key (see storeLine) -> number of occurrences
    if (++line_count_array[storeLine($0)] == 1) {
        # First record of a new group. Keep it exactly as it was read, so that
        # -i/--ignore-case only influences the comparison and never rewrites
        # the text that is printed.
        deduplicated_lines[deduplicated_line_index++] = $0
    }
}


# After all input is read: print every input line when -D/--all-repeated is
# active, otherwise only the first line of each group.
END {
    if (uq_opt_all_repeated) {
        printResult(lines)
    } else {
        printResult(deduplicated_lines)
    }
}
207 changes: 193 additions & 14 deletions bin/uq
Original file line number Diff line number Diff line change
@@ -1,21 +1,200 @@
#!/bin/bash
# @Function
# Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
# same as `uniq` command in core utils,
# but detect repeated lines that are not adjacent, no sorting required.
#
# @Usage
# uq [OPTION]... [INPUT [OUTPUT]]
#
# @online-doc https://github.com/oldratlee/useful-scripts/blob/dev-2.x/docs/shell.md#-uq
# @author Zava Xu (zava.kid at gmail dot com)
# @author Jerry Lee (oldratlee at gmail dot com)

# -e: exit on error; -E: ERR traps are inherited by functions;
# -u: expanding an unset variable is an error;
# pipefail: a pipeline fails when any of its stages fails.
set -eEuo pipefail

# Program name, used in usage and diagnostic messages.
PROG="$(basename "$0")"
# Path of this script as resolved by `readlink -f`.
PROG_PATH="$(readlink -f "$0")"
# Directory of the script; helper/uq.awk is looked up below this directory.
PROG_DIR="$(dirname "$PROG_PATH")"

################################################################################
# util functions
################################################################################

# Terminal escape fragments. $'...' is bash's ANSI-C quoting, which turns
# backslash sequences into the actual control characters.
declare -r ec=$'\033'      # escape char
declare -r eend="${ec}[0m" # escape end
declare -r nl=$'\n'        # new line

#######################################
# Print the arguments as one line, in bold red when stdout is a terminal.
# Globals:   ec, eend (read)
# Arguments: words of the message
# Outputs:   the message on stdout
#######################################
redEcho() {
  # if/else instead of `A && B || C`: the plain branch must not run as a
  # fallback when the colored write fails.
  # printf instead of echo: a message such as "-n" is printed literally.
  if [ -t 1 ]; then
    printf '%s\n' "${ec}[1;31m$*$eend"
  else
    printf '%s\n' "$*"
  fi
}

#######################################
# Print the arguments as one line, in bold yellow when stdout is a terminal.
# Globals:   ec, eend (read)
# Arguments: words of the message
# Outputs:   the message on stdout
#######################################
yellowEcho() {
  # if/else instead of `A && B || C`: the plain branch must not run as a
  # fallback when the colored write fails.
  # printf instead of echo: a message such as "-n" is printed literally.
  if [ -t 1 ]; then
    printf '%s\n' "${ec}[1;33m$*$eend"
  else
    printf '%s\n' "$*"
  fi
}

#######################################
# Report a fatal error on stderr and terminate the script.
# Arguments: words of the error message
# Outputs:   "Error: <message>" on stderr (via redEcho)
# Returns:   never; exits with status 1
#######################################
die() {
  local -r message="Error: $*"
  redEcho "$message" >&2
  exit 1
}

#######################################
# Print the help text, optionally preceded by an error message, then exit.
# Globals:   PROG, nl (read)
# Arguments: $1  - exit code, default 0
#            $2… - optional error message printed before the help text
# Outputs:   stdout when the exit code is 0, stderr otherwise
# Returns:   never; exits with the given exit code
#######################################
usage() {
  local -r exit_code="${1:-0}"
  if (($# > 0)); then
    shift
  fi

  # Write through the already-open descriptor (>&fd) instead of re-opening
  # /dev/stdout or /dev/stderr by path: when the stream is a regular file, every
  # re-open truncates it, so the error message would be wiped by the help text.
  local out_fd=1
  if [ "$exit_code" != 0 ]; then
    out_fd=2
  fi

  if (($# > 0)); then
    redEcho "$*$nl" >&"$out_fd"
  fi

  cat >&"$out_fd" <<EOF
Usage: ${PROG} [OPTION]... [INPUT [OUTPUT]]
Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output).
Same as \`uniq\` command in core utils,
but detect repeated lines that are not adjacent, no sorting required.

Example:
  # only one file, output to stdout
  uq in.txt
  # more than 1 file, last file argument is output file
  uq in.txt out.txt
  # when use - as output file, output to stdout
  uq in1.txt in2.txt -

Options:
  -c, --count               prefix lines by the number of occurrences
  -d, --repeated            only print duplicate lines, one for each group
  -D                        print all duplicate lines
                            combined with -c/-d option usually
      --all-repeated[=METHOD]  like -D, but allow separating groups
                            with an empty line;
                            METHOD={none(default),prepend,separate}
  -u, --unique              Only output unique lines
                            that are not repeated in the input
  -i, --ignore-case         ignore differences in case when comparing
  -z, --zero-terminated     line delimiter is NUL, not newline

Miscellaneous:
  -h, --help                display this help and exit
EOF

  exit "$exit_code"
}

################################################################################
# parse options
################################################################################

# Option flags: 0 = off, 1 = on. They are handed to helper/uq.awk through
# `awk -v` under the same names.
uq_opt_count=0
uq_opt_only_repeated=0
uq_opt_all_repeated=0
# one of: none, prepend, separate
uq_opt_repeated_method=none
uq_opt_only_unique=0
uq_opt_ignore_case=0
uq_opt_zero_terminated=0
# positional (non-option) arguments, in command line order
declare -a argv=()

#######################################
# Parse the command line.
# Globals:   uq_opt_count, uq_opt_only_repeated, uq_opt_all_repeated,
#            uq_opt_repeated_method, uq_opt_only_unique, uq_opt_ignore_case,
#            uq_opt_zero_terminated, argv (written); PROG, nl (read)
# Arguments: the command line arguments of the script
# Returns:   0 on success; exits through usage on -h/--help, on an
#            unrecognized option, or on an invalid --all-repeated method
#######################################
parseOptions() {
  while (($# > 0)); do
    case "$1" in
    -c | --count)
      uq_opt_count=1
      ;;
    -d | --repeated)
      uq_opt_only_repeated=1
      ;;
    # --all-repeated without =METHOD behaves like -D (METHOD is optional)
    -D | --all-repeated)
      uq_opt_all_repeated=1
      ;;
    --all-repeated=*)
      uq_opt_all_repeated=1
      # strip up to the first '=' only, so a bogus value such as "a=b" is
      # reported in full instead of being cut at its own '='
      uq_opt_repeated_method=${1#*=}
      [[ $uq_opt_repeated_method == 'none' || $uq_opt_repeated_method == 'prepend' || $uq_opt_repeated_method == 'separate' ]] ||
        usage 1 "$PROG: invalid argument ‘${uq_opt_repeated_method}’ for ‘--all-repeated’${nl}Valid arguments are:$nl - ‘none’$nl - ‘prepend’$nl - ‘separate’"
      ;;
    -u | --unique)
      uq_opt_only_unique=1
      ;;
    -i | --ignore-case)
      uq_opt_ignore_case=1
      ;;
    -z | --zero-terminated)
      uq_opt_zero_terminated=1
      ;;
    -h | --help)
      usage
      ;;
    --)
      # everything after -- is positional, even when it starts with '-'
      shift
      # += never expands the (possibly empty) array, so it is safe under
      # `set -u` on bash versions that reject "${argv[@]}" for empty arrays
      argv+=("$@")
      break
      ;;
    -)
      # a single dash is a file argument (stdin/stdout), not an option
      argv+=("$1")
      ;;
    -*)
      usage 2 "${PROG}: unrecognized option '$1'"
      ;;
    *)
      argv+=("$1")
      ;;
    esac
    shift
  done
}

parseOptions "$@"

# -u/--unique selects exactly the lines that -d and -D exclude, so combining
# them can never produce output: reject the combination.
if [[ $uq_opt_only_unique == 1 ]]; then
  if [[ $uq_opt_only_repeated == 1 ]]; then
    usage 2 "printing duplicated lines(-d, --repeated) and unique lines(-u, --unique) is meaningless"
  fi
  if [[ $uq_opt_all_repeated == 1 ]]; then
    usage 2 "printing all duplicate lines(-D, --all-repeated) and unique lines(-u, --unique) is meaningless"
  fi
fi

# Plain -D without counting, filtering or group delimiters prints the input
# unchanged; warn, but carry on.
if [[ $uq_opt_all_repeated == 1 && $uq_opt_repeated_method == none && $uq_opt_count == 0 && $uq_opt_only_repeated == 0 ]]; then
  yellowEcho "[$PROG] WARN: -D/--all-repeated=none option without -c/-d option, just cat input simply!" >&2
fi

argc=${#argv[@]}

# Split the positional arguments into input files and the output file:
#   0 arguments  -> no input files (nothing is passed to awk), output to stdout
#   1 argument   -> one input file, output to stdout
#   2+ arguments -> the last one is the output file ("-" means stdout),
#                   all the others are input files
output_file=/dev/stdout
case "$argc" in
0)
  input_files=()
  ;;
1)
  input_files=("${argv[0]}")
  ;;
*)
  input_files=("${argv[@]:0:argc-1}")
  if [ "${argv[argc - 1]}" != - ]; then
    output_file=${argv[argc - 1]}
  fi
  ;;
esac

# Check that every input file is an existing, readable regular file.
# The ${arr[@]:+...} form expands to nothing for an empty array.
for f in ${input_files[@]:+"${input_files[@]}"}; do
  # - is stdin, ok
  if [ "$f" = - ]; then
    continue
  fi

  if [ ! -e "$f" ]; then
    die "input file $f does not exist!"
  fi
  if [ -d "$f" ]; then
    die "input file $f exists, but is a directory!"
  fi
  if [ ! -f "$f" ]; then
    die "input file $f exists, but is not a file!"
  fi
  if [ ! -r "$f" ]; then
    die "input file $f exists, but is not readable!"
  fi
done

################################################################################
# biz logic
################################################################################

# Hand every option to the awk helper as a variable of the same name; the
# helper script is located relative to this script's directory.
awk_options=(
  -v "uq_opt_count=$uq_opt_count"
  -v "uq_opt_only_repeated=$uq_opt_only_repeated"
  -v "uq_opt_all_repeated=$uq_opt_all_repeated"
  -v "uq_opt_repeated_method=$uq_opt_repeated_method"
  -v "uq_opt_only_unique=$uq_opt_only_unique"
  -v "uq_opt_ignore_case=$uq_opt_ignore_case"
  -v "uq_opt_zero_terminated=$uq_opt_zero_terminated"
  -f "$PROG_DIR/helper/uq.awk"
)

# No file operands are passed when input_files is empty.
awk "${awk_options[@]}" -- ${input_files[@]:+"${input_files[@]}"} >"$output_file"
Loading

0 comments on commit 513a869

Please sign in to comment.