forked from oldratlee/useful-scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
394 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#!/usr/local/bin/awk -f | ||
|
||
function printResult(for_lines) { | ||
for (idx = 0; idx < length(for_lines); idx++) { | ||
line=for_lines[idx] | ||
count=line_count_array[storeLine(line)] | ||
|
||
#printf "DEBUG 1: %7s %s, index: %s\n", count, line, idx | ||
|
||
if (uq_opt_only_unique) { | ||
if (count == 1) printLine(count, line) | ||
} else { | ||
#printf "DEBUG 2: %7s %s uq_opt_only_repeated: %s\n", count, line, uq_opt_only_repeated | ||
|
||
if (uq_opt_only_repeated && count <= 1) { | ||
continue | ||
} | ||
|
||
if (uq_opt_repeated_method == "prepend" || uq_opt_repeated_method == "separate" && outputted) { | ||
if (!compareLine(line, outputted)) print "" | ||
} | ||
|
||
printLine(count, line) | ||
outputted=line | ||
} | ||
} | ||
} | ||
|
||
function printLine(count, line) { | ||
if (uq_opt_count) { | ||
printf "%7s %s%s", count, line, ORS | ||
} else { | ||
print line | ||
} | ||
} | ||
|
||
function storeLine(line) { | ||
if (uq_opt_ignore_case) { | ||
return tolower(line) | ||
} else { | ||
return line | ||
} | ||
} | ||
|
||
function compareLine(line1, line2) { | ||
return storeLine(line1) == storeLine(line2) | ||
} | ||
|
||
|
||
BEGIN { | ||
if (uq_opt_zero_terminated) { | ||
RS = "\0" | ||
ORS = "\0" | ||
} | ||
} | ||
|
||
|
||
{ | ||
# use index to keep lines order | ||
lines[line_index++] = $0 | ||
|
||
store_line=storeLine($0) | ||
# line_count_array: line content -> count | ||
if (++line_count_array[store_line] == 1) { | ||
# use index to keep lines order | ||
deduplicated_lines[deduplicated_line_index++] = store_line | ||
} | ||
} | ||
|
||
|
||
END { | ||
if (uq_opt_all_repeated) printResult(lines) | ||
else printResult(deduplicated_lines) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,200 @@ | ||
#!/bin/bash | ||
# @Function | ||
# print uniq line keep order, no sorting required | ||
# Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output). | ||
# same as `uniq` command in core utils, | ||
# but detect repeated lines that are not adjacent, no sorting required. | ||
# | ||
# @Usage | ||
# uq [OPTION]... [INPUT [OUTPUT]] | ||
# | ||
# @online-doc https://github.com/oldratlee/useful-scripts/blob/dev-2.x/docs/shell.md#-uq | ||
# @author Zava Xu (zava.kid at gmail dot com) | ||
# @author Jerry Lee (oldratlee at gmail dot com) | ||
set -eEuo pipefail | ||
|
||
PROG="$(basename "$0")" | ||
PROG_PATH="$(readlink -f "$0")" | ||
PROG_DIR="$(dirname "$PROG_PATH")" | ||
|
||
################################################################################ | ||
# util functions | ||
################################################################################ | ||
|
||
# NOTE: $'foo' is the escape sequence syntax of bash | ||
readonly ec=$'\033' # escape char | ||
readonly eend=$'\033[0m' # escape end | ||
readonly nl=$'\n' # new line | ||
|
||
redEcho() { | ||
[ -t 1 ] && echo "${ec}[1;31m$*$eend" || echo "$*" | ||
} | ||
|
||
yellowEcho() { | ||
[ -t 1 ] && echo "${ec}[1;33m$*$eend" || echo "$*" | ||
} | ||
|
||
die() { | ||
redEcho "Error: $*" 1>&2 | ||
exit 1 | ||
} | ||
|
||
usage() { | ||
local -r exit_code="${1:-0}" | ||
(($# > 0)) && shift | ||
# shellcheck disable=SC2015 | ||
[ "$exit_code" != 0 ] && local -r out=/dev/stderr || local -r out=/dev/stdout | ||
|
||
(($# > 0)) && redEcho "$*$nl" >$out | ||
|
||
cat >$out <<EOF | ||
Usage: ${PROG} [OPTION]... [INPUT [OUTPUT]] | ||
Filter lines from INPUT (or standard input), writing to OUTPUT (or standard output). | ||
Same as \`uniq\` command in core utils, | ||
but detect repeated lines that are not adjacent, no sorting required. | ||
outputUniqLines() { | ||
awk '{ | ||
s[$0]++ | ||
} | ||
END { | ||
for(v in s) { | ||
if (s[v] == 1) { | ||
print v | ||
} | ||
} | ||
}' | ||
Example: | ||
# only one file, output to stdout | ||
uq in.txt | ||
# more than 1 file, last file argument is output file | ||
uq in.txt out.txt | ||
# when use - as output file, output to stdout | ||
uq in1.txt in2.txt - | ||
Options: | ||
-c, --count prefix lines by the number of occurrences | ||
-d, --repeated only print duplicate lines, one for each group | ||
-D print all duplicate lines | ||
combined with -c/-d option usually | ||
--all-repeated[=METHOD] like -D, but allow separating groups | ||
with an empty line; | ||
METHOD={none(default),prepend,separate} | ||
-u, --unique Only output unique lines | ||
that are not repeated in the input | ||
-i, --ignore-case ignore differences in case when comparing | ||
-z, --zero-terminated line delimiter is NUL, not newline | ||
Miscellaneous: | ||
-h, --help display this help and exit | ||
EOF | ||
|
||
exit "$exit_code" | ||
} | ||
|
||
cat "$@" | outputUniqLines | ||
################################################################################ | ||
# parse options | ||
################################################################################ | ||
|
||
uq_opt_count=0 | ||
uq_opt_only_repeated=0 | ||
uq_opt_all_repeated=0 | ||
uq_opt_repeated_method=none | ||
uq_opt_only_unique=0 | ||
uq_opt_ignore_case=0 | ||
uq_opt_zero_terminated=0 | ||
declare -a argv=() | ||
|
||
while (($# > 0)); do | ||
case "$1" in | ||
-c | --count) | ||
uq_opt_count=1 | ||
shift | ||
;; | ||
-d | --repeated) | ||
uq_opt_only_repeated=1 | ||
shift | ||
;; | ||
-D) | ||
uq_opt_all_repeated=1 | ||
shift | ||
;; | ||
--all-repeated=*) | ||
uq_opt_all_repeated=1 | ||
uq_opt_repeated_method=$(echo "$1" | awk -F= '{print $2}') | ||
[[ $uq_opt_repeated_method == 'none' || $uq_opt_repeated_method == 'prepend' || $uq_opt_repeated_method == 'separate' ]] || | ||
usage 1 "$PROG: invalid argument ‘${uq_opt_repeated_method}’ for ‘--all-repeated’${nl}Valid arguments are:$nl - ‘none’$nl - ‘prepend’$nl - ‘separate’" | ||
shift | ||
;; | ||
-u | --unique) | ||
uq_opt_only_unique=1 | ||
shift | ||
;; | ||
-i | --ignore-case) | ||
uq_opt_ignore_case=1 | ||
shift | ||
;; | ||
-z | --zero-terminated) | ||
uq_opt_zero_terminated=1 | ||
shift | ||
;; | ||
-h | --help) | ||
usage | ||
;; | ||
--) | ||
shift | ||
argv=("${argv[@]}" "$@") | ||
break | ||
;; | ||
-) | ||
argv=(${argv[@]:+"${argv[@]}"} "$1") | ||
shift | ||
;; | ||
-*) | ||
usage 2 "${PROG}: unrecognized option '$1'" | ||
;; | ||
*) | ||
argv=(${argv[@]:+"${argv[@]}"} "$1") | ||
shift | ||
;; | ||
esac | ||
done | ||
|
||
[[ $uq_opt_only_repeated == 1 && $uq_opt_only_unique == 1 ]] && | ||
usage 2 "printing duplicated lines(-d, --repeated) and unique lines(-u, --unique) is meaningless" | ||
[[ $uq_opt_all_repeated == 1 && $uq_opt_only_unique == 1 ]] && | ||
usage 2 "printing all duplicate lines(-D, --all-repeated) and unique lines(-u, --unique) is meaningless" | ||
|
||
[[ $uq_opt_all_repeated == 1 && $uq_opt_repeated_method == none && ( $uq_opt_count == 0 && $uq_opt_only_repeated == 0 ) ]] && | ||
yellowEcho "[$PROG] WARN: -D/--all-repeated=none option without -c/-d option, just cat input simply!" >&2 | ||
|
||
argc=${#argv[@]} | ||
|
||
if ((argc == 0)); then | ||
input_files=() | ||
output_file=/dev/stdout | ||
elif ((argc == 1)); then | ||
input_files=("${argv[0]}") | ||
output_file=/dev/stdout | ||
else | ||
input_files=("${argv[@]:0:argc-1}") | ||
output_file=${argv[argc - 1]} | ||
if [ "$output_file" = - ]; then | ||
output_file=/dev/stdout | ||
fi | ||
fi | ||
|
||
# Check input file | ||
for f in ${input_files[@]:+"${input_files[@]}"}; do | ||
# - is stdin, ok | ||
[ "$f" = - ] && continue | ||
|
||
[ -e "$f" ] || die "input file $f does not exist!" | ||
[ ! -d "$f" ] || die "input file $f exists, but is a directory!" | ||
[ -f "$f" ] || die "input file $f exists, but is not a file!" | ||
[ -r "$f" ] || die "input file $f exists, but is not readable!" | ||
done | ||
|
||
################################################################################ | ||
# biz logic | ||
################################################################################ | ||
|
||
awk \ | ||
-v "uq_opt_count=$uq_opt_count" \ | ||
-v "uq_opt_only_repeated=$uq_opt_only_repeated" \ | ||
-v "uq_opt_all_repeated=$uq_opt_all_repeated" \ | ||
-v "uq_opt_repeated_method=$uq_opt_repeated_method" \ | ||
-v "uq_opt_only_unique=$uq_opt_only_unique" \ | ||
-v "uq_opt_ignore_case=$uq_opt_ignore_case" \ | ||
-v "uq_opt_zero_terminated=$uq_opt_zero_terminated" \ | ||
-f "$PROG_DIR/helper/uq.awk" \ | ||
-- ${input_files[@]:+"${input_files[@]}"} \ | ||
>"$output_file" |
Oops, something went wrong.