rdf+bcp47+hxl (#41): partial refactoring; started roundtrip tests
fititnt committed Jun 5, 2022
1 parent 49dcdac commit a646fe6
Showing 3 changed files with 118 additions and 157 deletions.
161 changes: 21 additions & 140 deletions officina/999999999/0/L999999999_0.py
@@ -612,7 +612,18 @@ def bcp47_langtag(
def bcp47_langtag_callback_hxl(
langtag_meta: dict,
strictum: bool = True
) -> dict:
) -> str:
"""bcp47_langtag_callback_hxl convert a bcp47_langtag meta to hxl attributes
Args:
langtag_meta (dict): a bcp47_langtag compatible metadata
strictum (bool, optional): (not implemented yet). Defaults to True.
Returns:
str: return HXL attributes (without HXL hashtag)
"""


resultatum = []
# resultatum.append('+todo')
resultatum.append('+i_{0}'.format(langtag_meta['language'].lower()))
@@ -636,16 +647,6 @@ def bcp47_langtag_callback_hxl(
if _r['rdf:subject'] and len(_r['rdf:subject']) > 0:
for item in _r['rdf:subject']:
subject_key, subject_namespace, _nop = item.lower().split(':')
# raise ValueError(item)
# item_num = int(''.join(filter(str.isdigit, item)))
# # item_prefix = ''.join(filter(str.isdigit, item, ))
# item_prefix = item.replace(str(item_num), '')
# item_prefix = item_prefix.encode("unicode_escape").decode()
# item_prefix = item_prefix.replace('\\', '')
# # item_prefix = item_prefix.encode().decode()
# # prefix, term = item.lower().split(':')
# resultatum.append(
# '+rdf_s_{0}_{1}'.format(item_prefix, item_num))
resultatum.append(
'+rdf_s_{0}_s{1}'.format(subject_key, subject_namespace))

@@ -685,22 +686,6 @@ def bcp47_langtag_callback_hxl(
resultatum.append('+rdf_y_{0}_{1}'.format(
EXTRA_OPERATORS['GS']['hxl'], value_separator))

# if 'csvw:separator' in _r and \
# _r['csvw:separator'] and len(_r['csvw:separator']) > 0:
# decoded_separator = None
# for decoded, value in CSVW_SEPARATORS.items():
# if _r['csvw:separator'] == value:
# decoded_separator = decoded
# break
# if decoded_separator is None:
# raise NotImplementedError(
# '[{0}] [{1}] not implemented in <{2}>'.format(
# _r['csvw:separator'], langtag_meta, CSVW_SEPARATORS
# ))

# resultatum.append(
# '+rdf_y_csvwseparator_{0}'.format(decoded_separator))

resultatum = sorted(resultatum)

return ''.join(resultatum)
@@ -910,101 +895,6 @@ def bcp47_rdf_extension(
r_op_2 = r_op_2.lower()

r_rest = r_rest - 3
continue
r_item_key = r_parts[r_parts_tot - r_rest]
r_item_value = r_parts[r_parts_tot - r_rest + 1]
if r_item_key.startswith('p'):
_predicates.append('{0}:{1}'.format(
r_item_key.lstrip('p').lower(),
r_item_value
))

# sU2200
# elif r_item_key.lower().startswith('su'):
# _subjects.append('∀{0}'.format(
# r_item_value.lstrip('s')
# ))
# elif r_item_key.lower().startswith('ss'):
# _subjects.append('{0}'.format(
# r_item_value.lstrip('s')
# ))

# # oU1F517
# elif r_item_key.lower().startswith('ou'):
# _objects.append('🔗{0}'.format(
# r_item_value.lstrip('o')
# ))

# exemplum: sU2200 (if using a Unicode code point as prefix, assume it is the key)
elif r_item_key.lower().startswith('su'):
_subjects.append('{0}:{1}'.format(
r_item_key.lstrip('s'), r_item_value.lstrip('s')
))
# exemplum: sS (not using a Unicode code point as key; assume it is just
# a pointer, not the pivot column)
elif r_item_key.lower().startswith('ss'):
_subjects.append('{0}:{1}'.format(
'_' + r_item_key.lower().lstrip(
's'), r_item_value.lstrip('s')
))

# exemplum: oU1F517
# @TODO removing test data for this. Maybe re-add later
elif r_item_key.lower().startswith('ou'):
_objects.append('{0}:{1}'.format(
r_item_key.lstrip('o'), r_item_value.lstrip('o')
))

elif r_item_key.startswith('t'):
if result['rdfs:Datatype'] is None:
result['rdfs:Datatype'] = '{0}:{1}'.format(
r_item_key.lstrip('t').lower(),
r_item_value
)
else:
result['_error'].append(
'rdfs:Datatype [{0}]-[{1}]'.format(
r_item_key,
r_item_value
))

elif r_item_key.lower().startswith('ycsvw'):
if r_item_key.lower() == 'ycsvwseparator':
r_item_value = r_item_value.lower()
r_item_value_enc = '__error__'
# @TODO implement in pure Python encode/decode. This is
# an obviously ugly hack
if r_item_value in CSVW_SEPARATORS:
r_item_value_enc = CSVW_SEPARATORS[r_item_value]

else:
raise NotImplementedError(
'Sorry, separator [{0}] of [{1}] not implemented. '
'This may be a bug. '
'Hardcoded options <{2}> '.format(
r_item_value, rem, CSVW_SEPARATORS))

result['csvw:separator'] = '{0}'.format(
# r_item_value
r_item_value_enc
)
# result['_error'].append(
# 'csvw:??? [{0}]-[{1}]'.format(
# r_item_key,
# r_item_value
# ))

elif r_item_key.lower() == 'yprefix':
if 'prefix' not in result:
result['prefix'] = []
result['prefix'].append(r_item_value.lower())

else:
result['_error'].append('Unknown [{0}-{1}]'.format(
r_item_key,
r_item_value
))
r_rest = r_rest - 2

# if len(_predicates) > 0:
# _predicates.sort()
@@ -2594,6 +2484,7 @@ def hxl_hashtag_to_bcp47(hashtag: str) -> str:
'rdf:predicate': [],
'rdf:object': [],
'rdfs:Datatype': None,
'xsl:transform': [],
'_unknown': [],
'_error': [],
# 'csvw:separator': '', # Added only if necessary
@@ -2654,31 +2545,21 @@ def hxl_hashtag_to_bcp47(hashtag: str) -> str:
_subject_code = _subject_code.upper()
_subjec_value = _subjec_value.replace('s', '')

# @TODO get a value like '∀0' instead of 'u2200:0'
# _subject_code = ("\u2200").encode().decode('utf-8')
# _subject_code = ("\u2200")
# _subject_code = ("\u2200")
# _subject_code = str(ord("\u2200"))
# _subject_code = str(ord("\u2200"))
# _subject_code = ("\\u{0}".format(str(2200)))
# _subject_code = (r"\u2200").encode().decode('utf-8')
# _subject_code = ("\u2200").encode().decode('utf-8')
_subjec_value = _subjec_value.replace('s', '')
# result['extension']['r']['rdf:subject'].append(_subject)
result['extension']['r']['rdf:subject'].append('{0}:{1}'.format(
_subject_code, _subjec_value
))

_bpc47_g_parts.append('s{0}-s{1}'.format(
_bpc47_g_parts.append('s{0}-s{1}-snop'.format(
_subject_code, _subjec_value
))

elif item.startswith('p_'):
_predicate = item.replace('p_', '').replace('_', ':')
result['extension']['r']['rdf:predicate'].append(_predicate)
_predicate_key, _object = _predicate.split(':')
_bpc47_g_parts.append('p{0}-{1}'.format(
_predicate_key.upper(), _object
_predicate_key, _object, _subject = _predicate.split(':')
_bpc47_g_parts.append('p{0}-p{1}-p{2}'.format(
_predicate_key.upper(), _object, _subject
))

elif item.startswith('y_'):
Expand All @@ -2700,11 +2581,11 @@ def hxl_hashtag_to_bcp47(hashtag: str) -> str:
_tvalue, hashtag, CSVW_SEPARATORS
))

result['extension']['r']['csvw:separator'] = \
decoded_separator
# result['extension']['r']['csvw:separator'] = \
# decoded_separator
# _predicate_key, _object = _predicate.split(':')
_bpc47_g_parts.append('yCSVWseparator-{0}'.format(
decoded_separator
_bpc47_g_parts.append('y{0}-y{1}'.format(
EXTRA_OPERATORS['GS']['hxl'].upper(), decoded_separator
))
elif _tkey == 'prefix':
if 'prefix' not in result['extension']['r']:
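For reference, a minimal standalone sketch of the attribute-assembly idea that bcp47_langtag_callback_hxl implements: build '+'-prefixed HXL attributes, sort them, and join them. The flat input dictionary and the helper name are assumptions for illustration only; the real function receives the full bcp47_langtag metadata structure.

# Illustrative sketch only, not the repository's implementation.
def langtag_meta_to_hxl_attrs(langtag_meta: dict) -> str:
    resultatum = []
    resultatum.append('+i_{0}'.format(langtag_meta['language'].lower()))
    if langtag_meta.get('script'):
        resultatum.append('+is_{0}'.format(langtag_meta['script'].lower()))
    # rdf:subject items such as 'U2200:0:nop' become '+rdf_s_u2200_s0'
    for item in langtag_meta.get('rdf:subject', []):
        subject_key, subject_namespace, _nop = item.lower().split(':')
        resultatum.append(
            '+rdf_s_{0}_s{1}'.format(subject_key, subject_namespace))
    return ''.join(sorted(resultatum))

# Hypothetical input; prints '+i_qcc+is_zxxx+rdf_s_u2200_s0'
print(langtag_meta_to_hxl_attrs(
    {'language': 'qcc', 'script': 'Zxxx', 'rdf:subject': ['U2200:0:nop']}))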
13 changes: 9 additions & 4 deletions officina/999999999/0/linguacodex.py
@@ -298,12 +298,17 @@
5. Extras from EticaAI/lexicographi-sine-finibus application
{0} --de_bcp47_simplex \
--de_codex=qcc-Zxxx-r-pDCT-modified-txsd-datetime
--de_codex=qcc-Zxxx-r-pDCT-pmodified-ps1-txsd-tdatetime-tnop \
--quod=.Language-Tag_normalized
(Nonsense attribute and hashtag, but exercising all options)
{0} --de_bcp47_simplex \
--de_codex=qcc-Zxxx-r-sU2203-s2-snop-yU001D-yu007c-ynop-yU0002-yunescothes\
-ynop-pSKOS-pbroader-ps2-tXSD-tdatetime-tnop
{0} --de_bcp47_simplex --de_codex=qcc-Zxxx-r-sU2203-s2-snop-\
yU001D-yu007c-ynop-yU0002-yunescothes-ynop-pSKOS-pbroader-ps2-\
tXSD-tdatetime-tnop
{0} --de_hxl_simplex --de_hxlhashtag=\
'#item+i_qcc+is_zxxx+rdf_p_skos_broader_s2+rdf_s_u2203_s2+rdf_t_xsd_datetime\
+rdf_y_u0002_unescothes+rdf_y_u001d_u007c'
{0} --de_hxl_simplex --de_hxlhashtag=\
'#item+i_qcc+is_zxxx+rdf_s_u2200_s0+rdf_p_SKOS_related+ix_wikip123' \
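To make the hashtag grammar in the examples above concrete, a hypothetical parsing sketch (not the repository's hxl_hashtag_to_bcp47): it only groups the '+'-separated attributes of an HXL hashtag by whether they carry rdf_* extension data.

# Hypothetical helper for illustration; the name and grouping are assumptions.
def split_hxl_hashtag(hashtag: str) -> dict:
    base, *attrs = hashtag.lstrip('#').split('+')
    grouped = {'hashtag': base, 'rdf': [], 'other': []}
    for attr in attrs:
        # attributes such as 'rdf_s_u2203_s2' carry the BCP47 '-r-' extension data
        grouped['rdf' if attr.startswith('rdf_') else 'other'].append(attr)
    return grouped

# Hashtag taken from the example above; 'rdf' collects the five rdf_* attributes,
# 'other' keeps ['i_qcc', 'is_zxxx'].
print(split_hxl_hashtag(
    '#item+i_qcc+is_zxxx+rdf_p_skos_broader_s2+rdf_s_u2203_s2'
    '+rdf_t_xsd_datetime+rdf_y_u0002_unescothes+rdf_y_u001d_u007c'))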
101 changes: 88 additions & 13 deletions officina/999999999/1568346/bcp47-to-hxl-to-rdf.sh
@@ -77,7 +77,7 @@ bcp47_to_hxl_to_rdf__tests() {
}

#######################################
# bcp47_to_hxl_to_rdf__tests
# test_unesco_thesaurus
#
# Globals:
# ROOTDIR
@@ -100,30 +100,105 @@ test_unesco_thesaurus() {
--rdf-namespaces-archivo="${archivum__namespace}" \
"${archivum__unesco_thesaurus_bcp47}" |
rapper --quiet --input=turtle --output=turtle /dev/fd/0 \
> "${archivum__resultata_bag1}"
>"${archivum__resultata_bag1}"

"${ROOTDIR}/999999999/0/999999999_54872.py" \
--objectivum-formato=_temp_bcp47 \
--rdf-bag=2 \
--rdf-namespaces-archivo="${archivum__namespace}" \
"${archivum__unesco_thesaurus_bcp47}" |
rapper --quiet --input=turtle --output=turtle /dev/fd/0 \
> "${archivum__resultata_bag2}"
>"${archivum__resultata_bag2}"

# riot --output=Turtle \
riot --time --output=RDF/XML \
"${archivum__resultata_bag1}" \
"${archivum__resultata_bag2}" \
>"${archivum__resultata_xml}"

riot --time --output=Turtle \
"${archivum__resultata_xml}" \
>"${archivum__resultata_ttl}"

riot --validate "${archivum__resultata_ttl}"
}

#######################################
# bcp47_and_hxlrdf_roundtrip
#
# Globals:
# ROOTDIR
# Arguments:
#   bpc47, hxlattr, hxlattr_final, bpc47_final (all optional)
# Outputs:
# Test result
#######################################
bcp47_and_hxlrdf_roundtrip() {
bpc47="${1-""}"
hxlattr="${2-""}"
bpc47_final="${4-""}"
hxlattr_final="${3-""}"

hxlattr_discovered=""
hxlattr_discovered_2nd=""
bpc47_discovered=""
bpc47_discovered_2nd=""

if [ -n "$bpc47" ]; then
echo "[$bpc47] bpc47 input"

hxlattr_discovered=$("${ROOTDIR}/999999999/0/linguacodex.py" \
--de_bcp47_simplex --de_codex="$bpc47" \
--quod=._callbacks.hxl_attrs)

hxlattr_discovered=${hxlattr_discovered//\"/}
echo "[$hxlattr_discovered] hxlattr_discovered"

bpc47_discovered_2nd=$("${ROOTDIR}/999999999/0/linguacodex.py" \
--de_hxl_simplex --de_hxlhashtag="#item${hxlattr_discovered}" \
--quod=.Language-Tag_normalized)

bpc47_discovered_2nd=${bpc47_discovered_2nd//\"/}
echo "[$bpc47_discovered_2nd] bpc47_discovered_2nd"
else
echo "noop bpc47"
fi

if [ -n "$hxlattr" ]; then
echo "[$hxlattr] hxlattr input"

bpc47_discovered=$("${ROOTDIR}/999999999/0/linguacodex.py" \
--de_hxl_simplex --de_hxlhashtag="#item${hxlattr}" \
--quod=.Language-Tag_normalized)

bpc47_discovered=${bpc47_discovered//\"/}
echo "[$bpc47_discovered] bpc47_discovered"

hxlattr_discovered_2nd=$("${ROOTDIR}/999999999/0/linguacodex.py" \
--de_bcp47_simplex --de_codex="$bpc47_discovered" \
--quod=._callbacks.hxl_attrs)

# riot --output=Turtle \
riot --time --output=RDF/XML \
"${archivum__resultata_bag1}" \
"${archivum__resultata_bag2}" \
> "${archivum__resultata_xml}"
hxlattr_discovered_2nd=${hxlattr_discovered_2nd//\"/}
echo "[$hxlattr_discovered_2nd] hxlattr_discovered_2nd"

riot --time --output=Turtle \
"${archivum__resultata_xml}" \
> "${archivum__resultata_ttl}"
else
echo "noop hxlattr"
fi
return 0

riot --validate "${archivum__resultata_ttl}"
}

# echo "test"

# bcp47_to_hxl_to_rdf__tests
test_unesco_thesaurus
# test_unesco_thesaurus

echo ""
echo " test1"
bcp47_and_hxlrdf_roundtrip "qcc-Zxxx-r-sU2203-s2-snop" ""
echo ""
echo " test2"
bcp47_and_hxlrdf_roundtrip "" "+i_qcc+is_zxxx+rdf_s_u2203_s2"
echo ""
echo " test3"
bcp47_and_hxlrdf_roundtrip "qcc-Zxxx-r-sU2203-s2-snop-yU001D-yu007c-ynop-yU0002-yunescothes-ynop-pSKOS-pbroader-ps2-tXSD-tdatetime-tnop" ""
