From 56823e3c787fdc6f88339fc109f3a9c7102b9813 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Hohwiller?= Date: Sun, 26 Jan 2025 11:24:38 +0100 Subject: [PATCH] JavaDoc, code-style --- .../github/mmm/text/ascii/AsciiConverter.java | 25 ++++++++++++++----- .../mmm/text/ascii/impl/AsciiBuilder.java | 4 +-- .../ascii/impl/AsciiConverterConfigImpl.java | 1 + .../text/ascii/impl/AsciiConverterImpl.java | 21 +++++++++++++--- .../io/github/mmm/text/ascii/impl/Char.java | 2 +- .../io/github/mmm/text/ascii/impl/Chars.java | 2 +- .../mmm/text/ascii/impl/CodePointMapping.java | 14 ++++++++++- .../io/github/mmm/text/ascii/impl/Factor.java | 2 +- .../github/mmm/text/ascii/impl/Letters.java | 12 ++++----- .../io/github/mmm/text/ascii/impl/Number.java | 2 +- .../mmm/text/ascii/impl/StructuredWord.java | 2 +- .../io/github/mmm/text/ascii/impl/Unit.java | 2 +- .../io/github/mmm/text/ascii/impl/Word.java | 10 +++++++- .../io/github/mmm/text/ascii/impl/Xxx.java | 2 +- ascii/src/main/java/module-info.java | 4 ++- 15 files changed, 77 insertions(+), 28 deletions(-) diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/AsciiConverter.java b/ascii/src/main/java/io/github/mmm/text/ascii/AsciiConverter.java index ad1daba..47b4e36 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/AsciiConverter.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/AsciiConverter.java @@ -5,13 +5,16 @@ import io.github.mmm.text.ascii.impl.AsciiConverterImpl; /** - * {@link AsciiConverter} converts Unicode to simple 7-bit ASCII characters. It strips diacritic marks and transliterates non-Latin letters and glyphs to ASCII. - * It is optimized for performance and does not implement perfectly correct transliteration (e.g. it has no state to transliterate a code-point dependening on - * its predecessors).
- * However, it is very helpful to decode strings for use-cases like indexing and searching or if you want to build a {@link String} to be used for restricted - * environments (names of files, folders, etc.) where special characters could cause problems. + * {@link AsciiConverter} converts Unicode to simple 7-bit ASCII characters. It strips diacritic marks and + * transliterates non-Latin letters and glyphs to ASCII. It is optimized for performance and does not implement + * perfectly correct transliteration (e.g. it has no state to transliterate a code-point depending on its + * predecessors).
+ * However, it is very helpful to decode strings for use-cases like indexing and searching or if you want to build a + * {@link String} to be used for restricted environments (names of files, folders, etc.) where special characters could + * cause problems. * * @since 1.0.0 + * @see java.text.Normalizer */ public interface AsciiConverter { @@ -19,7 +22,17 @@ public interface AsciiConverter { * @param codePoint the {@link String#codePointAt(int) codePoint} to convert. * @return the converted ASCII representation of the given {@link String#codePointAt(int) codePoint}. */ - String convert(int codePoint); + default String convert(int codePoint) { + + return convert(codePoint, AsciiConverterConfig.of()); + } + + /** + * @param codePoint the {@link String#codePointAt(int) codePoint} to convert. + * @param config the {@link AsciiConverterConfig}. + * @return the converted ASCII representation of the given {@link String#codePointAt(int) codePoint}. + */ + String convert(int codePoint, AsciiConverterConfig config); /** * @param text the (unicode) {@link CharSequence} to convert. diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiBuilder.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiBuilder.java index e1a27fe..85844cb 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiBuilder.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiBuilder.java @@ -5,9 +5,9 @@ /** * State for {@link AsciiConverterImpl}. 
* - * @see CodePointMapping#append(AsciiBuilder, int) + * @see CodePointMapping#append(AsciiBuilder, int, CodePointMapping) */ -class AsciiBuilder { +public class AsciiBuilder { final StringBuilder sb; diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterConfigImpl.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterConfigImpl.java index 3451914..7269e25 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterConfigImpl.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterConfigImpl.java @@ -12,6 +12,7 @@ */ public final class AsciiConverterConfigImpl implements AsciiConverterConfig { + /** The default instance. */ public static final AsciiConverterConfigImpl DEFAULT = new AsciiConverterConfigImpl(); final CaseConversion caseConversion; diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterImpl.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterImpl.java index 9ba209e..2790059 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterImpl.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/AsciiConverterImpl.java @@ -4,6 +4,7 @@ import java.util.PrimitiveIterator.OfInt; +import io.github.mmm.base.text.CaseConversion; import io.github.mmm.text.ascii.AsciiConverter; import io.github.mmm.text.ascii.AsciiConverterConfig; @@ -25,13 +26,23 @@ private AsciiConverterImpl() { } @Override - public String convert(int codePoint) { + public String convert(int codePoint, AsciiConverterConfig config) { CodePointMapping asc = getAsc(codePoint); if (asc == null) { return null; } - return asc.toString(); + String string; + if (config.useLongForms()) { + string = asc.asStringLong(); + } else { + string = asc.asString(); + } + CaseConversion cc = config.caseConversion(); + if (cc != CaseConversion.ORIGINAL_CASE) { + string = cc.convert(string); + } + return string; } private static CodePointMapping getAsc(int codePoint) { @@ 
-251,8 +262,10 @@ public String convert(CharSequence text, AsciiConverterConfig config) { next = current.append(builder, codePoint, next); } } - // since text is not empty, we entered the while loop, next is the last mapping to process due to buffering. - next.append(builder, codePoint, null); + if (next != null) { + next.append(builder, codePoint, null); + // } else { fail(); } // since text is not empty, we entered the while loop, and next can never be null. + } return builder.getAscii(); } diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Char.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Char.java index 195fe80..b32ce14 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Char.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Char.java @@ -5,7 +5,7 @@ /** * Implementation of {@link CodePointMapping} for static {@link Character}. */ -class Char extends AbstractCodePointMapping { +public class Char extends AbstractCodePointMapping { // other non-letters static final Char TAB = new Char('\t'); diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Chars.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Chars.java index 97afd81..87c4fdd 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Chars.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Chars.java @@ -5,7 +5,7 @@ /** * Implementation of {@link CodePointMapping} for static {@link String}. */ -class Chars extends AbstractCodePointMapping { +public class Chars extends AbstractCodePointMapping { final String s; diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/CodePointMapping.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/CodePointMapping.java index 217f954..f441e35 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/CodePointMapping.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/CodePointMapping.java @@ -5,7 +5,7 @@ /** * Interface for the mapping result of a code-point. 
*/ -interface CodePointMapping { +public interface CodePointMapping { /** * @param builder the {@link AsciiBuilder}. @@ -15,18 +15,30 @@ interface CodePointMapping { */ CodePointMapping append(AsciiBuilder builder, int codePoint, CodePointMapping next); + /** + * @return the {@link #toString() string representation}. + */ String asString(); + /** + * @return a longer {@link #toString() string representation} as available (may be the same as {@link #asString()}). + */ default String asStringLong() { return asString(); } + /** + * @return the {@link Type}. + */ default Type getType() { return Type.OTHER; } + /** + * {@link Enum} with the available {@link CodePointMapping#getType() type}s for a {@link CodePointMapping}. + */ enum Type { TEXT, diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Factor.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Factor.java index a0d58ad..88745cd 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Factor.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Factor.java @@ -5,7 +5,7 @@ /** * Common factors used in physics, etc. */ -class Factor extends Fraction { +public class Factor extends Fraction { static final Factor TERA = new Factor(1_000_000_000_000L, 1, "T", "tera"); diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Letters.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Letters.java index c32ff32..0ecd733 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Letters.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Letters.java @@ -5,7 +5,7 @@ /** * Extends {@link Chars} for multiple latin letters. It typically represents a syllabe or sound (unlike a {@link Word}). 
*/ -class Letters extends Chars { +public class Letters extends Chars { Letters(String s) { @@ -13,17 +13,17 @@ class Letters extends Chars { assert isLetters(s); } - private boolean isLetters(String s) { + private boolean isLetters(String string) { - int len = s.length(); + int len = string.length(); if (len < 2) { - throw new IllegalArgumentException(s + " - expected at least two letters."); + throw new IllegalArgumentException(string + " - expected at least two letters."); } for (int i = 0; i < len; i++) { - int codePoint = s.codePointAt(i); + int codePoint = string.codePointAt(i); if (!isLatinLetter(codePoint)) { throw new IllegalArgumentException( - s + " - at index " + i + " illegal code point 0x" + Long.toHexString(codePoint)); + string + " - at index " + i + " illegal code point 0x" + Long.toHexString(codePoint)); } } return true; diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Number.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Number.java index af3061d..1b66f68 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Number.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Number.java @@ -5,7 +5,7 @@ /** * {@link CodePointMapping} for a numeric value sign. */ -interface Number extends CodePointMapping { +public interface Number extends CodePointMapping { /** * @return the numerator as {@code long}. The actual value is {@link #getNumerator()}/{@link #getDenominator()}. diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/StructuredWord.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/StructuredWord.java index 799536a..dc71c20 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/StructuredWord.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/StructuredWord.java @@ -6,7 +6,7 @@ * Extends {@link Word} for a structured term that has a {@link #asStringLong() long form} where as {@link #asString()} * typically is an abbreviation. 
*/ -class StructuredWord extends Word { +public class StructuredWord extends Word { final String full; diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Unit.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Unit.java index 81d22cd..9fc0664 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Unit.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Unit.java @@ -5,7 +5,7 @@ /** * Extends {@link Letters} for a (physical) unit. */ -class Unit extends StructuredWord { +public class Unit extends StructuredWord { static final Unit ACRE = new Unit("a", "acre"); diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Word.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Word.java index 075f60d..2b125b7 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Word.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Word.java @@ -5,7 +5,7 @@ /** * Extends {@link Letters} for a word. Subsequent words will be separated by space. */ -class Word extends Chars { +public class Word extends Chars { Word(String s) { @@ -18,6 +18,14 @@ public Type getType() { return Type.TEXT; } + /** + * Default part of {@link #append(AsciiBuilder, int, CodePointMapping)} that can be overridden by sub-classes. + * + * @param builder the {@link AsciiBuilder}. + * @param codePoint the original {@link String#codePointAt(int) code point}. + * @param next the next {@link CodePointMapping}. + * @return the {@link CodePointMapping} for stateful processing. + */ protected CodePointMapping doAppend(AsciiBuilder builder, int codePoint, CodePointMapping next) { return super.append(builder, codePoint, next); diff --git a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Xxx.java b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Xxx.java index 5eff1a6..8ad07d2 100644 --- a/ascii/src/main/java/io/github/mmm/text/ascii/impl/Xxx.java +++ b/ascii/src/main/java/io/github/mmm/text/ascii/impl/Xxx.java @@ -14,7 +14,7 @@ *
  • {@code WD_*} for word
  • * */ -abstract class Xxx { +public abstract class Xxx { // strings diff --git a/ascii/src/main/java/module-info.java b/ascii/src/main/java/module-info.java index 019dc3d..885a7d8 100644 --- a/ascii/src/main/java/module-info.java +++ b/ascii/src/main/java/module-info.java @@ -4,7 +4,9 @@ */ /** - * Conversion from unicode to ASCII (simplified transliteration) and related features. + * Conversion from Unicode to ASCII (simplified transliteration) and related features. + * + * @see io.github.mmm.text.ascii.AsciiConverter */ module io.github.mmm.text.ascii {