-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathread_sign_list.py
905 lines (839 loc) · 37.4 KB
/
read_sign_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
import codecs
import csv
import re
import sys
import unicodedata
import numbers
# Replace the text layer of standard output by a UTF-16 writer over the
# underlying binary stream, so that everything printed to stdout is encoded
# as UTF-16.
sys.stdout = codecs.getwriter("utf-16")(sys.stdout.detach())
# Abbreviations of the sign lists accepted as the source of a reading;
# Reading.normalize raises on any other source prefix (after fixing known
# typos).
SOURCES = ['MesZL', 'Labat', 'ABZ']
def is_printable_basic_latin(c):
    """Return whether c lies between '!' and '~' inclusive (string order)."""
    return "!" <= c <= "~"
def is_lowercase_akkadian_letter(c):
    """Return whether c occurs in the lowercase Akkadian transcription alphabet."""
    lowercase_letters = 'bdgptkʾṭqzšsṣḫmnrlwyjaeiu'
    return c in lowercase_letters
def is_capital_akkadian_letter(c):
    """Return whether c occurs in the uppercased Akkadian transcription alphabet."""
    capital_letters = 'bdgptkʾṭqzšsṣḫmnrlwyjaeiu'.upper()
    return c in capital_letters
def is_digit(c):
    """Return whether c lies between '0' and '9' inclusive (string order)."""
    return '0' <= c <= '9'
def is_composition_sign(c):
    """Return whether c is one of the non-letter characters used in compositions.

    These are the determinative marker f, the punctuation transcriptions
    ':', '⫶' and '/', and the variant marker v.
    """
    composition_signs = 'f:⫶/v'
    return c in composition_signs
def is_composition_character(c):
    """Return whether c may appear in a composition string.

    That is the case for lowercase Akkadian letters, digits, composition
    signs, and the letter x.
    """
    if is_lowercase_akkadian_letter(c):
        return True
    if is_digit(c):
        return True
    if is_composition_sign(c):
        return True
    return c == 'x'
class Reading:
    """A single reading of a sign.

    Attributes are plain strings filled in by the parsing code:
      value: the reading as given in the list (e.g. a sign name or a value).
      comment: the parenthesized comment attached to the reading, if any.
      source: the sign list the reading comes from (one of SOURCES), or ''.
      disambiguator: suffix appended to the composition to make it unique.
      sign: the sign (string of cuneiform characters) being read.
      šašková_index: index of the originating row, or None for readings that
        were not taken from the list.
      keep: False once the reading has been identified as a duplicate.
    """

    def __init__(self, sign, šašková_index):
        self.value = ''
        self.comment = ''
        self.source = ''
        self.disambiguator = ''
        self.sign = sign
        self.šašková_index = šašková_index
        self.keep = True

    def composition(self):
        """Return the string typed to obtain this reading.

        This is the lowercased value followed by the disambiguator.
        """
        return self.value.lower() + self.disambiguator

    def normalize(self):
        """Clean up value and comment, and extract the source from the comment.

        Raises:
          ValueError: if the comment starts with a source prefix that is
            neither KŠ, nor a known typo of MesZL, nor one of SOURCES.
        """
        # Properly write aleph, Y is a synonym for J, and we handle variant more
        # comprehensively than the single KAMᵛ.
        self.value = self.value.strip().replace(
            '’', 'ʾ').replace('Y', 'J').replace('v', '')
        self.comment = self.comment.replace('’', 'ʾ')
        # A leading word followed by ';' or ':' names the source of the
        # reading.  The pattern is a raw string so that \w reaches the regex
        # engine without being an invalid string escape.
        match = re.match(r'^(\w+)[;:]', self.comment)
        if match:
            source = match[1]
            if source == 'KŠ':
                # Kateřina Šašková marks her own comments like this.
                # Since we are processing her list, everything is by definition
                # therein; the source field tracks provenance from older lists.
                return
            if source in ('MesZ', 'MeZL', 'MeLZ', 'MesLZ'):  # Typos.
                source = 'MesZL'
            if source not in SOURCES:
                raise ValueError('Unexpected source %s' % source)
            self.source = source
# See the comments below re. DUN₃ 𒂅, DUN₃ gunû 𒂆, and DUN₃ gunû gunû 𒂇.
# Maps a reading value to the sign which is substituted for 𒂆 in the sign of
# a reading with that value; a value missing from this table is an error
# (KeyError) where the table is used.
DUN3_VARIANTS = {
    # http://oracc.museum.upenn.edu/ogsl/signlist/l0068/o0000160/index.html
    'DU5': '𒂅',
    'DUG5': '𒂅',
    'DUN3': '𒂅',
    'SU18': '𒂅',
    'SUG5': '𒂅',
    'TU18': '𒂅',
    'TUN3': '𒂅',
    'TUG8': '𒂅',
    'ṬU': '𒂅',
    # In Borger, not in Oracc. Adding it where the SUG reading is.
    'SUK5': '𒂅',
    # Labat-only readings, not in Oracc; adding those to the variant with the
    # DUN/TUN and SU readings.
    'SU14': '𒂅',
    'ṬUN': '𒂅',
    # http://oracc.museum.upenn.edu/ogsl/signlist/l0068/o0000161/index.html
    'AGA3': '𒂆',
    'GE11': '𒂆',
    'GI11': '𒂆',
    'GIG4': '𒂆',
    'GIM2': '𒂆',
    'GIN2': '𒂆',
    'PUŠ4': '𒂆',
    # http://oracc.museum.upenn.edu/ogsl/signlist/l0080/o0002178/index.html
    'ḪURSAG': '𒂅',
    # http://oracc.museum.upenn.edu/ogsl/signlist/l0071/o0001279/index.html
    'AGARIN3': '𒂆',
    # http://oracc.museum.upenn.edu/ogsl/signlist/l0071/o0001367/index.html
    'GILGAMEŠ': '𒂆',
    'GILGAMES': '𒂆',
    # http://oracc.museum.upenn.edu/ogsl/signlist/l0090/o0002642/index.html
    'NIR2': '𒂆',
    # Labat-only reading, not in Oracc.
    'NINI2': '𒂆',
}
# Lists of Reading objects, keyed by reading value and by sign respectively.
# Both are filled while the sign list is read and pruned of duplicates
# afterwards.
readings_by_value = {}
readings_by_sign = {}
def insert_parentheses(original, amendment):
    """Add missing parentheses to original, as described by amendment.

    The square brackets in amendment mark the parentheses to insert: the text
    of amendment without its brackets is looked up in original, and every
    occurrence is replaced by amendment with '[' and ']' turned into '(' and
    ')'.
    """
    without_brackets = amendment.translate(str.maketrans('', '', '[]'))
    with_parentheses = amendment.translate(str.maketrans('[]', '()'))
    return original.replace(without_brackets, with_parentheses)
def delete_parentheses(original, amendment):
    """Remove spurious parentheses from original, as described by amendment.

    The square brackets in amendment mark the parentheses to delete: amendment
    with '[' and ']' turned into '(' and ')' is looked up in original, and
    every occurrence is replaced by amendment without its brackets.
    """
    with_parentheses = amendment.translate(str.maketrans('[]', '()'))
    without_brackets = amendment.translate(str.maketrans('', '', '[]'))
    return original.replace(with_parentheses, without_brackets)
# Read the sign list and turn every row into Reading objects, registered in
# readings_by_value and readings_by_sign.
# As used below: row[0] and row[1] hold two forms of the sign (the comments
# below refer to them as the UR III and neo-Assyrian forms respectively),
# row[2] holds the sign name on its first line followed by the parenthesized
# readings, and row[3] holds the MesZL number.
# NOTE(review): the path is relative to the working directory and uses a
# Windows path separator.
with open(r".\sign_list.csv", encoding="utf-8") as file:
    reader = csv.reader(file)
    ok_entries = 0
    erroneous_entries = 0
    # Number of times each MesZL number has been seen; repeated numbers get a
    # /n suffix so that individual rows can be identified below.
    meszl_seen = {}
    row_index = 0
    for row in reader:
        meszl = row[3]
        if meszl in meszl_seen:
            meszl_seen[meszl] += 1
            meszl += '/%d' % meszl_seen[meszl]
        else:
            meszl_seen[meszl] = 1
        # Rows whose two sign forms are missing, differ, or contain Latin
        # characters must be explicitly known below: `continue` skips the row,
        # `pass` lets it through to the processing that follows.
        if (not row[0] or
            any(is_printable_basic_latin(c) for c in row[0] + row[1]) or
            row[0] != row[1]):
            if meszl == '003+003\n(839+756+003+003)':
                # A spelling of Idiqlat in the MesZL glossary. No sign name, just type
                # it as ḪAL.ḪAL.
                continue
            elif row[2].startswith('UŠUMX\n'):
                pass  # UŠUMₓ is missing in the Sinacherib font.
            elif row[2].startswith('ARAD x ŠE\n'):
                continue  # Labat has ìr×še but Borger does not; it is not encoded.
            elif (row[0] and all(not is_printable_basic_latin(c) for c in row[0]) and
                  (not row[1] or
                   (any(is_printable_basic_latin(c) for c in row[1]) and
                    (all (word.strip() in ('', '.', 'x', 'over', 'inverted', 'crossing',
                                           'opposing',)
                          for word in re.split('[^!-~]', row[1])))))):
                pass  # Signs missing in the Sinacherib font.
            elif '𒄒' in row[0] and row[1] == row[0].replace('𒄒', '𒁉𒑖'):
                # The Sinacherib font has a GIŠ crossing GIŠ which does not look like
                # the neo-Assyrian KIB; these should be unified, and a neo-Assyrian font
                # should have the KIB glyph for that code point.
                pass
            elif meszl == '58':
                continue  # 𒅗×𒌍 is an unencoded variant of 𒅗×𒊓 = 𒅾.
            elif meszl in (
                '27',
                '36',  # HZL 137: unbekannte Bedeutung (Gegenstand aus Holz).
                '40',  # HZL 138: Gerät?, Behälter? aus Kupfer.
                '41',  # HZL 139: ein Behälter aus Holz.
                '55',
                '67',  # HZL 150: Körperteilbezeichnung?
                '70',  # HZL 142: u.B.
                '156',
                '194',
                '224',
                '243',
                '278',
                '282',
                '319/2',
                '322',
                '393',
                '408/2',
                '454',
                '488',
                '518',
                '524',
                '647',
                '680',
                '697',
                '763',
                '886',
                ):
                # Signs from https://www.unicode.org/wg2/docs/n4277.pdf.
                pass
            elif 'BAD squared' in row[2]:
                # We unify BAD squared with IDIM over IDIM squared, since IDIM is part
                # of BAD in both Labat and Borger, and both sign lists mention only a
                # squared BAD, not a squared IDIM over IDIM; indeed the latter has no
                # reading in Šašková.
                pass
            elif row[2].startswith('NUN crossing NUN.LAGAR over LAGAR'):
                continue  # Unified with TUR3 over TUR3, we keep the one with readings.
            elif row[2].startswith('TUR3 over TUR3\n'):
                pass  # See above.
            elif row[2].startswith('ŠIR over ŠIR.BUR over BUR'):
                pass  # Sign missing in the Sinacherib font.
            elif ('𒊩𒌆' in row[0] and
                  row[0] in row[1] and
                  row[0].replace('𒊩𒌆', '𒊩𒈠') in row[1]
                  and 'Neo-Assyrian:' in row[1]):
                # Prior to the encoding of NIN one had to use either MUNUS.TUG₂ or
                # MUNUS.MA, the latter being the neo-Assyrian style. Šašková gives
                # both, with a note.
                pass
            elif meszl == '170 (also 250)':
                # Borger lists two variant glyphs of TA×ḪI as separate entries, the
                # second one being only a reference to the former. Only one is
                # encoded.
                pass
            elif meszl == '250':
                continue  # That one is a reference without readings in Šašková.
            elif meszl == '250 (also 170)':
                # Same as '170 (also 250)', except there is one more reading.
                pass
            elif row[2].startswith('SA.NI'):
                pass  # Labat-only sign, no neo-Assyrian form.
            elif meszl == '177':
                # Borger writes USAN (GÚ×NUN, GÚ-NUN), and thus Šašková gives both
                # 𒄛 and 𒄘𒉣. On the other hand for 178, Borger writes
                # DUR (GÚ×GAG, GÚ-GAG) yet Šašková gives only 𒄙 and lets the
                # neo-Assyrian font handle it by rendering that as GÚ-GAG. Leave the
                # variant of USAN up to the font here too; Borger gives only one
                # Assyrian glyph anyway.
                pass
            elif meszl == '189':
                # As far as I can tell 𒊕×𒉌 SAG×NI is not encoded. It is attested,
                # e.g., https://cdli.ucla.edu/search/archival_view.php?ObjectID=P217023.
                # Its reading is unknown. It probably should be encoded.
                continue
            elif meszl in ('231', '231/2'):
                # Same story for 𒀊×𒌋 AB×U, attested, e.g., in
                # https://cdli.ucla.edu/search/archival_view.php?ObjectID=P227527.
                # Unclear whether AB×AŠ is actually a thing; both are under 231.
                continue
            elif meszl == '233':
                # Similarly for 𒀊×𒆠 AB×KI, but if I am reading Borger correctly that
                # one is only attested in one or two tablets (MSL 16 218 211, whatever
                # that means exactly). Nothing on CDLI.
                continue
            elif meszl == '208':
                # As far as I can tell NIQ₃ is not encoded; is it even a thing? It comes
                # with a great deal of question marks in the literature.
                continue
            elif meszl in ('240', '240/2'):
                # UM×U-LAGAB, URUDU×U-LAGAB, not encoded.
                continue
            elif row[2].startswith('URUDU x U'):
                # Unencoded variant of UM×U, same number in Borger.
                continue
            elif row[2].startswith('DUB x ŠA3'):
                # DUB׊A₃ is not encoded, UM׊A₃ is. The latter reading is also
                # mentioned as Landsberger’s in Borger’s entry 244. Šašková writes “old
                # variant of DUB x ŠA3?” in her entry for UM׊A₃; just unify them.
                pass
            elif row[2].startswith('DUB x LAGAB'):
                # Exact same story with DUB×LAGAB vs. UM×LAGAB, 245.
                pass
            elif meszl == '254':
                # KAM₂ has the same neo-Assyrian glyph as GAN (253). In Labat (143),
                # the Babylonian glyph is shown as a tilted version of that neo-Assyrian
                # glyph. That tilted glyph also appears in Borger as KAMᵛ, in the entry
                # 595 for KAM, and in the middle Assyrian section of Labat’s entry 406
                # for KAM. Borger gives no Babylonian glyph for KAM₂, so it is possible
                # that he calls any tilted GAN KAMᵛ.
                # Unicode has U+1219A (KAM2) 𒆚 whose reference glyph is tilted.
                # This would match the Babylonian glyphs for KAM₂, or the glyph KAMᵛ.
                # Šašková’s list exclaims that KAM2 is the wrong name for that
                # character, i.e., that it represents KAMᵛ. There isn’t much intrinsic
                # to the standard that implies that: the reference glyphs are
                # Babylonian, so KAM₂ would have this glyph, and KAMᵛ would be an
                # unencoded variant. It is unclear whether KAMᵛ is a thing outside of
                # Assyrian styles, so it may well be that it need not be encoded by the
                # standards of Unicode.
                # Indeed KAM appears to be a common transcription of KAMᵛ, and KAM
                # written 𒄭×𒁁 seems rare in neo-Assyrian.
                # Where Šašková goes with
                # 𒄰 = ḪI×BAD = KAM ≠ KAMᵛ = U+1219A 𒆚, KAM₂ = GAN or unencoded,
                # we choose
                # 𒄰 = ḪI×BAD = KAM = KAMᵛ ≠ KAM₂ = U+1219A 𒆚 KAM2 ≠ GAN.
                # This approach is etymologically sound. It also has the advantage of
                # being consistent with Oracc conventions, which, being maintained under
                # the auspices of Tinney who co-authored the Unicode proposals, are
                # probably sound.
                # On the flipside, this means that for neo-Assyrian purposes, a font is
                # needed that uses the Babylonian glyph for KAM₂ as its glyph for KAM,
                # and the same neo-Assyrian glyph for both KAM₂ and GAN.
                # Then again neo-Assyrian badly needs a new font anyway, all the
                # existing ones are stuck sometime before 2014.
                pass
            elif meszl == '276':
                # Borger writes “Sehr unsicher.” of EZEN×SI?, it is not encoded.
                continue
            elif meszl == '287':
                # See the comments about DUN₃ below.
                pass
            elif row[2].startswith('KASKAL over KASKAL.LAGAB over LAGAB'):
                # It appears that šubtu₄ is not encoded.
                continue
            elif meszl == '303':
                # The neo-Assyrian form is given as KASKAL.UD×EŠ whereas the UR III form
                # is given as KASKAL.UD šeššig, even though UD×EŠ and UD šeššig have the
                # same neo-Assyrian glyph. Oracc says UD šeššig is correct here, use
                # that.
                pass
            elif meszl == '319':
                # An erroneous entry: The sign name is AL×KID₂ (which is MesZL 475,
                # encoded), the given sign is 𒉒 × 𒋺 NINDA₂×KID₂, which is not present
                # in Borger.
                continue
            elif meszl == '321':
                # NINDA₂×BAN₂, not encoded.
                continue
            elif meszl == '325':
                # NINDA₂×DUB, not encoded, has a question mark in Borger.
                continue
            elif meszl == '328':
                # NINDA₂׊ID, not encoded, also a question mark.
                continue
            elif meszl == '329':
                # NINDA₂×U₂, not encoded, exists in Borger only with the mention
                # “Aus ÚR×Ú zu erschliessen?”.
                continue
            elif meszl in ('333', '333v3', '333v7'):
                # The ŠAM₂ variants are a mess. Perhaps they are supposed to be partly
                # handled at the font level?
                # TODO(egg): In any case it is incorrect to assign the readings only to
                # the first variant, and then to discard them because it is not encoded;
                # it is easy to find, e.g., NINDA₂׊E AN with the reading ša₁₀:
                # https://cdli.ucla.edu/search/archival_view.php?ObjectID=P345814
                continue
            elif meszl in ('334', '335', '337'):
                # More unencoded 𒉒×something signs with no readings.
                continue
            elif meszl == '355':
                # 𒌈 gunû and ×𒃸, not encoded.
                continue
            elif meszl == '364':
                # Borger writes “Wenn es ŠIM×BÚR gegeben hat […]”. Not encoded.
                continue
            elif meszl == '370':
                continue  # ŠIM×PI, not encoded.
            elif meszl == '379 (sign KAK)':
                # KAK × IGI gunû, is not in Sinacherib, KAK.IGI gunû is used instead.
                continue
            elif ('𒉌𒌓' in row[0] and
                  row[0] in row[1] and
                  row[0].replace('𒉌𒌓', '𒉌𒂟') in row[1]
                  and 'Neo-Assyrian:' in row[1]):
                # Prior to the encoding of NA₄ one had to use either NI.UD or NI.ERIM,
                # the latter being the neo-Assyrian style. Šašková gives both, with a
                # note.
                pass
            elif row[2].startswith('GA2 x EZEN'):
                # Labat-only variant of 𒃢=GA₂×PA, in parentheses in Labat.
                # Not encoded.
                continue
            elif meszl == '423':
                continue  # Borger writes “unsicher”; not encoded.
            elif meszl == '436':
                # Unencoded neo-Assyrian ligature of NI and GIŠ, with the neo-Assyrian
                # glyph of KISAL.
                continue
            elif meszl in ('456', '456/2'):
                # A sign with uncertain decompositions in Borger, Proto-Ea only. Not
                # encoded.
                continue
            elif meszl == '460/2':
                continue  # An unencoded variant of 𒁦.
            elif '𒁃' in row[0]:
                # BAḪAR₂ tends to be decomposed (into 𒂁𒋡𒁓) in Assyrian sign lists,
                # but it is its own thing earlier (LAK742) and is encoded separately.
                pass
            elif meszl == '473':
                continue  # GU₄ × KASKAL, not encoded.
            elif meszl == '488/2':
                continue  # Alternative decomposition of 𒎘.
            elif row[2].startswith('SANGA2\n'):
                # In neo-Assyrian 𒊫 looks like 𒅍𒈣𒂀, but Sinacherib does not
                # support it.
                pass
            elif meszl == '520':
                continue  # Lots of question marks in Borger; not encoded.
            elif meszl == '529':
                continue  # LÚ × KU (oder ähnlich); not encoded.
            elif row[2].startswith('ŠU.MIN.MEŠ\n'):
                pass  # Typo in the neo-Assyrian form (ŠU.MIN.AN.MEŠ).
            elif meszl in ('579+?', '579+?+579', '579+579+?'):
                continue  # TODO(egg): I have no idea what is going on with these.
            elif meszl in ('588/2', '588/3'):
                continue  # Unencoded variants.
            elif meszl in ('604', '607'):
                continue  # Unencoded ŠA₃×something signs.
            # NOTE(review): this branch repeats the previous condition and can
            # therefore never be taken.
            elif meszl in ('604', '607'):
                continue  # Unencoded ŠA₃×something signs.
            elif meszl in ('624/2', '626'):
                continue  # Some sort of NUNUZ-based mess.
            elif meszl == '636+?':
                continue  # Illegible sign from Labat’s index.
            elif meszl in ('654', '656', '709'):
                continue  # Numeric signs, we handle those separately anyway.
            elif meszl in ('730', '735'):
                pass  # Variants.
            elif meszl in ('741\nalso 882', '882\nalso 741'):
                pass  # 𒎔 vs. 𒉾.
            elif meszl == '746+358+?':
                continue  # ???
            elif row[2].startswith('LAGAB x GAR3\n'):
                continue  # That’s a lot of question marks.
            elif meszl == '757':
                pass  # Seems to just be the same sign as ENGUR.
            elif meszl == '796':
                continue  # INDA₂ is not encoded.
            elif meszl == '811':
                continue  # No name, side-by-side ligature of existing signs.
            elif meszl in ('829/2', '829/3'):
                continue  # Unencoded variants.
            elif meszl == '837':
                continue  # Numeric sign.
            elif meszl == '839+086+298+591':
                continue  # Needless decomposition of ASAL₂.
            elif meszl == '845':
                pass  # Typo in the UR III form, A.A×A instead of A×A, handled below.
            elif row[2].startswith('LAK 852\n'):
                pass  # LAK 852, missing in Sinacherib.
            elif meszl == '870':
                # Variants of EN₂. Let’s just pick 𒋙𒀭: looking at Labat, 𒌋𒀭 is the
                # classical Sumerian version, before 𒋙 was a thing; this can be handled
                # at the font level.
                pass
            elif meszl.startswith('XXX'):
                pass  # Ancient signs, not in Borger, not in Sinacherib.
            elif row == ['', '', '', '', '', '']:
                break  # We have reached the end of the table.
            else:
                raise ValueError(row)
        row_index += 1
        # The readings are everything in row[2] between its first line (the
        # sign name) and its last line, joined by spaces.
        readings = ' '.join(row[2].split('\n')[1:-1])
        uncommented_readings = ''
        if not readings:
            readings = '()'
        # Mismatched parentheses; by MesZL number; entries with identical MesZL number
        # are indexed after the slash.
        if meszl in ('69', '598/5', '454'):
            readings = '(' + readings
        elif meszl in (
            '848', '45', '84', '129', '187', '193', '202', '223+889+552',
            '266 (sign LUGAL)', '302+596', '353/2', '469+809+598+590/2',
            '491+380', '491+748', '491+839', '541+184', '545', '724+136',
            '737+755', '839+010+387', '839+756+202', '303'
            ):
            readings += ')'
        elif meszl in ('001+183', '280 (sign EZEN x MIR)', '575+183', '748+183',
                       '493 (sign IL2)\nlater:\n493+201+565'):
            readings = '(' + readings + ')'
        elif meszl in ('242+753', '380+827', '546\nalso 485', '703/2', '883+149', '883+827'):
            if readings[-1] != ')':
                raise ValueError('No trailing parenthesis to strip from readings in %r' % row)
            readings = readings[:-1]
        elif meszl in ('13', '184+464+755'):
            readings = readings.replace('))),', ')),')
        elif meszl in ('701+232+553', '701+232+553/2', '788', '836'):
            readings = readings.replace(')),', '),')
        # NOTE(review): '84' is already handled by the branch appending ')'
        # above, so this branch can never be taken.
        elif meszl == '84':
            readings = insert_parentheses(readings, '([MesZL: variant of KA x GU (no. 69)];')
        elif meszl == '142':
            readings = insert_parentheses(readings, '(ŠAR5 = IM (no. 641)]')
        elif meszl == '150':
            readings = insert_parentheses(readings, '(Labat; MesZL: ŠURU6 = KID2 (no. 106)]')
        elif meszl == '010+296':
            readings = delete_parentheses(readings, '(= MesZL 296)];')
        elif meszl == '296':
            readings = delete_parentheses(readings, '(= MesZL 296)];')
        elif meszl == '348':
            readings = insert_parentheses(readings, '(MesZL: AL x ŠE (no. 479) = IL (no. 348)];')
        elif meszl == '362+010+120':
            readings = insert_parentheses(readings, ' (nos. 362+010+887+809+807)]')
        elif meszl == '479, 348':
            readings = insert_parentheses(readings, '(no. 348)];')
        elif meszl == '490':
            readings = delete_parentheses(readings, 'PU11, PU8 missing)]')
        elif meszl == '560+132':
            readings = insert_parentheses(readings, '(no. 560)],')
        elif meszl in ('809+816+580', '809+816+584'):
            readings = delete_parentheses(readings, '[MUPARRU')
        elif meszl == '839':
            readings = insert_parentheses(readings, '(no. 856)],')
        elif meszl == '883+381':
            readings = insert_parentheses(readings, '(nos. 382+889)],')
        elif meszl == '092, also 585':
            readings = insert_parentheses(readings, '([MesZL: see MUŠ (no. 585) and PAB (no. 92)];')
        if meszl == '572':
            # Missing delimiter after a comment.
            readings = readings.replace(
                '((MesZL: instead of KAŠŠEBA, KAŠŠEBI)',
                '((MesZL: instead of KAŠŠEBA, KAŠŠEBI);')
        if meszl == '577/2' or meszl == '576/2':
            # We have these glyphs and their readings for proper letter signs;
            # imparting these readings to the punctuation signs (they have separate
            # transcriptions for those roles given in MesZL).
            continue
        if meszl == '863':
            # We have two variants of a numeric sign for IMIN already, the use of a
            # disunified non-numeric sign is unclear, especially since which variant
            # is picked ends up being font-dependent...
            continue
        if readings[0] != '(' or readings[-1] != ')':
            raise ValueError(row)
        processed_readings = ''
        depth = 0
        sign = row[0]
        # Unify BAD squared and IDIM over IDIM squared, see above.
        sign = sign.replace('.𒁁squared', '𒅄')
        sign = sign.replace('𒁁squared', '𒅄')
        sign = sign.replace('𒍗squared', '𒅄')
        if row[2].startswith('TUR3 over TUR3\n'):
            # Borger writes, in Kap. II, entry 147:
            # Auch TÙR [over] TÙR, genauer [sign] =
            # NUN [over] NUN gekreuzt (n107) - LAGAR [over] LAGAR.
            # Accordingly, calling this sign TUR3 over TUR3 is imprecise,
            # and certainly it should be unified with
            # 𒉬 NUN CROSSING NUN LAGAR OVER LAGAR,
            # which matches the decomposition given by Borger and has no readings in
            # Šašková.
            sign = '𒉬'
        # Only one variant of TA×ḪI is encoded.
        sign = sign.replace('𒋭\nalso\n𒋫 x 𒄭', '𒋭')
        sign = sign.replace('𒋫 x 𒄭\nalso\n𒋭', '𒋭')
        # See the comment about USAN above.
        sign = sign.replace('𒄛\nand\n𒄘𒉣', '𒄛')
        # See the comments about 244 and 245 above.
        sign = sign.replace('𒁾 x𒊮', '𒌠')
        sign = sign.replace('𒁾 x𒆸', '𒌞')
        # For some reason Šašková does not always use 𒌍, which was there in the
        # initial Unicode 5.0 character set.
        sign = sign.replace('𒌋𒌋𒌋', '𒌍')
        # Use the signs from https://www.unicode.org/wg2/docs/n4277.pdf.
        # Global substitutions: U.U, ME.EŠ, MUNUS.TUG₂, NI.UD, MUNUS.KU, MI.NUNUZ,
        # NI.ERIM, ḪI.GIR₃ are always MAN, MEŠ, NIN, NA₄, NIN₉, GIG, DAG₃, ḪUS
        # respectively.
        sign = sign.replace(
            '𒌋𒌋', '𒎙').replace(
            '𒈨𒌍', '𒎌').replace(
            '𒊩𒌆', '𒎏').replace(
            '𒉌𒌓', '𒎎').replace(
            '𒊩𒆪', '𒎐').replace(
            '𒈪𒉭', '𒍼').replace(
            '𒉌𒂟', '𒍴').replace(
            '𒄭𒄊', '𒍽')
        # Disunification of ŠAR₂ 𒊹 and TI₂ 𒎗.
        if meszl == '633':
            sign = '𒎗'
        # Disunification of ERIM 𒂟 and PIR₂ 𒎕.
        if meszl == '613':
            sign = '𒎕'
        sign = sign.replace('𒅗 x 𒌅', '𒎆')
        sign = sign.replace('𒅗 x 𒌫', '𒎇')
        sign = sign.replace('𒅗 x 𒉺', '𒎄')
        sign = sign.replace('𒅗 x 𒄑', '𒎀')
        sign = sign.replace('𒅗 x 𒄯', '𒎂')
        sign = sign.replace('𒅗 x 𒐋', '𒍿')
        sign = sign.replace('𒅗 x 𒈝', '𒎃')
        sign = sign.replace('𒈹 x 𒍝', '𒎍')
        sign = sign.replace('𒊕 x 𒅊', '𒎖')
        sign = sign.replace('𒀊 x 𒉣', '𒍰')
        sign = sign.replace('𒁾 x 𒊺', '𒍶')
        sign = sign.replace('𒂡 x 𒄞', '𒍷')
        sign = sign.replace('𒂡 x 𒊺', '𒍸')
        sign = sign.replace('𒉒 x 𒁄', '𒎑')
        sign = sign.replace('𒉒 x 𒄀', '𒎒')
        sign = sign.replace('𒂷 x 𒀭𒆕𒀀', '𒍹')
        sign = sign.replace('𒂷 x 𒀾', '𒍺')
        sign = sign.replace('𒁖𒆨 x 𒌑𒈦', '𒍳')
        sign = sign.replace('𒌝 x 𒈨', '𒎘')
        sign = sign.replace('𒈕 x 𒁁', '𒎉')
        sign = sign.replace('𒇽 x 𒋗', '𒎋')
        sign = sign.replace('𒀖 x 𒀀', '𒍱')
        sign = sign.replace('𒀫 x 𒆬', '𒍲')
        sign = sign.replace('𒆸 x 𒄀', '𒎈')
        if sign == '𒀀𒀁':
            sign = '𒀁'  # Typo.
        # TODO(egg): Add the reading ešelal for 𒈀𒇲, and the alternative sign 𒎊.
        # See the extensive discussion of KAM₂ vs. KAMᵛ above.
        sign = sign.replace('𒆚', '𒄰')
        if meszl == '254':
            sign = '𒆚'
        # TODO(egg): investigate 𒌗 vs. 𒌚 for ITI, including in other signs.
        # Unicode has three signs DUN₃ 𒂅, DUN₃ gunû 𒂆, DUN₃ gunû gunû 𒂇; the
        # reference glyphs match the descriptions, they are increasingly gunûd.
        # In neo-Assyrian (or indeed in old Assyrian or old Babylonian) these
        # correspond to two signs, GIN₂ (which has the reading dun₃), and MIR,
        # where MIR=GIN₂ gunû (Borger 556).
        # Šašková assumes that the code point for dun₃(GIN₂) is DUN₃ 𒂅,
        # therefore that MIR = DUN₃ gunû 𒂆, and has no idea what to make of
        # DUN₃ gunû gunû 𒂇.
        # Looking at Labat is enlightening. The entry 347 for MIR shows two
        # precursor classical Sumerian glyphs, one of which is LAK 667 (resembling
        # the reference glyph for 𒂆), and the other one a seemingly unrelated
        # LAK 154; from LAK 667 Labat has an arrow redirecting to entry 595, while
        # LAK 154 morphs into something related to 𒂆 and becomes MIR, one of
        # whose old Babylonian glyphs is the reference glyph for 𒂇.
        # Meanwhile at entry 595 (TUN₃), Labat gives two precursor glyphs
        # resembling the reference glyphs for 𒂅 and 𒂆 (LAK 666 and 667),
        # merging into the latter in Assyrian and Babylonian.
        # It therefore appears that:
        # — LAK 666 is encoded as 𒂅;
        # — LAK 667 is encoded as 𒂆 = LAK 666 gunû;
        # — LAK 154 is encoded as 𒂇 = LAK 667 gunû;
        # — LAK 666 and LAK 667 merge (with the glyph of LAK 667);
        # — the result of this merger is read dun₃ in neo-Assyrian, but it looks
        #   like DUN₃ gunû.
        # We thus get MIR = 𒂇 rather than 𒂆, but the readings of GIN₂ have to
        # be split between DUN₃ 𒂅 and DUN₃ gunû 𒂆 (which will have the same
        # glyph in any Assyrian or Babylonian font).
        # The conventions used by Oracc are consistent with the above analysis.
        # The splitting of readings between 𒂅 and 𒂆 is largely a matter of
        # sumerology; we defer to Oracc without further investigation.
        #
        # Šašková consistently uses 𒂆 for MIR, replace that by 𒂇.
        sign = sign.replace('𒂆', '𒂇')
        # Same for a composite sign.
        sign = sign.replace('𒂧', '𒂨')
        # Use 𒂆 wherever Šašková uses 𒂅, we will disunify them below.
        sign = sign.replace('𒂅', '𒂆')
        # Now that we use the correct sign for GIN₂, we have a sign for EZEN×GIN₂.
        sign = sign.replace('𒂡 x 𒂆', '𒂧')
        # Do not decompose 𒁃 nor 𒀷.
        sign = sign.replace('𒂁𒋡𒁓', '𒁃')
        sign = sign.replace('𒀀𒌅𒃮𒇺', '𒀷')
        # Detect a sign given as two identical alternatives separated by
        # ",\n" or "\nor\n".
        # NOTE(review): in this non-raw string, the character class is the
        # range from NUL to form feed followed by the letter f; possibly
        # \xff was intended — confirm before changing.
        identical_alternatives = re.match('^([^\0-\ff]*)(,\n|\nor\n)\\1$', sign)
        if ('𒁃' in sign or '𒀷' in sign) and identical_alternatives:
            sign = identical_alternatives.groups()[0]
        if row[2].startswith('GE22\n'):
            sign = '𒍻'
        if meszl == '730':
            sign = sign.split('\nold\n')[0]
        if meszl == '735':
            sign = sign.split('\nnewer\n')[0]
        if row[2].startswith('PEŠ2v\n'):
            sign = '𒎔'
        if row[2].startswith('PEŠ2\n'):
            sign = '𒉾'
        if meszl == '757':
            sign = '𒇉'  # ZIKUM = ENGUR.
        if meszl == '870':
            sign = '𒋙𒀭'
        # After all the substitutions above, the sign must consist solely of
        # characters outside printable Basic Latin.
        if not sign or any(is_printable_basic_latin(c) for c in sign):
            raise ValueError('sign = "%s", in row %s' % (sign, row))
        # The first reading of a sign carries its name (the first line of
        # row[2]).
        first_reading = Reading(sign, row_index)
        first_reading.value = row[2].split('\n')[0]
        if sign == '𒇽𒇽' and first_reading.value == 'LU2 over LU2':
            # Not encoded, same reading as LU2.LU2 which is in the list.
            continue
        sign_readings = [first_reading]
        current_reading = first_reading
        # Parse the readings character by character, tracking the depth of
        # parentheses: depth 1 is the list of readings, deeper levels are the
        # comment of the current reading.
        for c in readings:
            processed_readings += c
            if depth == 1 and c in ',;':
                current_reading = Reading(sign, row_index)
                sign_readings.append(current_reading)
                continue  # Consume delimiters between comments.
            if c == '(':
                depth += 1
                if depth in (1, 2):
                    continue  # Consume the initial & start-of-comment parentheses.
            elif c == ')':
                depth -= 1
                if depth in (0, 1):
                    continue  # Consume the final & end-of-comment parentheses.
            if depth == 1:
                if current_reading is first_reading:
                    # The sign name is not followed by a delimiter; start the
                    # first actual reading.
                    current_reading = Reading(sign, row_index)
                    sign_readings.append(current_reading)
                current_reading.value += c
                if current_reading.comment:
                    raise ValueError(
                        'Reading %s restarts after comment %s [MesZL %s]' % (
                            current_reading.value, current_reading.comment, meszl))
            elif depth > 1:
                current_reading.comment += c
            else:
                raise ValueError('surfaced before end of readings: %s[!] %r' % (processed_readings, row))
        if depth != 0:
            raise ValueError('depth=%d at end of readings %r' % (depth, row))
        for reading in sign_readings:
            reading.normalize()
        # We handle numbers ourselves, and thus discard any numerical readings
        # found in Šašková.
        sign_readings = [
            reading for reading in sign_readings
            if any (c.isalpha() for c in reading.value)]
        # Deal with the disunification of 60 and 1 in Unicode.
        for reading in sign_readings:
            # Readings given for 60 in MesZL 748.
            if reading.sign == '𒁹' and reading.value in ('GEŠ2', 'GIŠ2', 'GEŠTA'):
                reading.sign = '𒐕'
            # Labat-only readings for 60n.
            if reading.sign == '𒐊' and reading.value == 'GEŠIA':
                reading.sign = '𒐙'
            if reading.sign == '𒐋' and reading.value == 'GEŠAŠ':
                reading.sign = '𒐚'
            if reading.sign in '𒐌𒑂' and reading.value == 'GEŠUMUN':
                reading.sign = '𒐛'
            if reading.sign in '𒐍𒑄' and reading.value == 'GEŠUSSU':
                reading.sign = '𒐜'
            if reading.sign == '𒑆' and reading.value == 'GEŠILIMMU':
                reading.sign = '𒐝'
            # Disunify DUN₃ and DUN₃ gunû according to the reading; this only
            # applies to actual readings, not to sign names (which contain
            # characters that are not composition characters).
            if '𒂆' in reading.sign and all(is_composition_character(c.lower())
                                            for c in reading.value):
                try:
                    reading.sign = reading.sign.replace('𒂆',
                                                        DUN3_VARIANTS[reading.value])
                except KeyError as e:
                    # Show the names of the signs involved before re-raising.
                    print(', '.join(unicodedata.name(c).replace('CUNEIFORM SIGN ', '')
                                    for c in reading.sign),
                          file=sys.stderr)
                    raise
        for reading in sign_readings:
            readings_by_value.setdefault(reading.value, []).append(reading)
            readings_by_sign.setdefault(reading.sign, []).append(reading)
        ok_entries += 1
# Insert the numbers which we listed ourselves.
# These readings are not taken from the sign list, hence the index None.
for sign, compositions in numbers.compositions_by_sign.items():
    for composition in compositions:
        reading = Reading(sign, šašková_index=None)
        reading.value = composition
        # Key by the value string, like the readings inserted while reading
        # the sign list; the bound method reading.composition (uncalled) was
        # previously used as the key, giving every reading its own key.
        readings_by_value.setdefault(reading.value, []).append(reading)
        readings_by_sign.setdefault(reading.sign, []).append(reading)
# Punctuation and common determinatives.
# These readings are not taken from the sign list, hence the index None.
for sign, compositions in {
    # MesZL 592.
    '𒑱' : [':'],
    # MesZL 576: Trennungszeichen (wie n592; Umschrift :). Disunified from GAM
    # in Unicode.
    '𒑲' : [':v1'],
    # MesZL 577: Trennungs- und Wiederholungszeichen (Umschrift mit Parpola,
    # LASEA pXX ⫶). Disunified from ILIMMU4 in Unicode.
    '𒑳' : ['⫶'],
    # Word divider. See MesZL 748, p. 418: In Kültepe wird ein senkrechter Keil
    # als Worttrenner gebraucht. Disunified from DIŠ in Unicode.
    # See AAA 1/3, 01 for an example usage:
    # https://cdli.ucla.edu/search/archival_view.php?ObjectID=P360975.
    # We use the transcription convention from CDLI, a forward slash.
    '𒑰' : ['/'],
    # Determinatives for personal names and gods.
    '𒁹' : ['m'],
    '𒊩' : ['f'],
    '𒀭' : ['d'],
}.items():
    for composition in compositions:
        reading = Reading(sign, šašková_index=None)
        reading.value = composition
        # Key by the value string, like the readings inserted while reading
        # the sign list; the bound method reading.composition (uncalled) was
        # previously used as the key, giving every reading its own key.
        readings_by_value.setdefault(reading.value, []).append(reading)
        readings_by_sign.setdefault(reading.sign, []).append(reading)
# Lists of readings keyed by composition string; rebuilt from
# readings_by_sign by recompute_readings_by_composition.
readings_by_composition = {}


def recompute_readings_by_composition():
    """Rebuild readings_by_composition in place from readings_by_sign."""
    readings_by_composition.clear()
    for sign_readings in readings_by_sign.values():
        for entry in sign_readings:
            bucket = readings_by_composition.setdefault(entry.composition(), [])
            bucket.append(entry)
def sign_name(sign):
    """Return the value of the first reading registered for sign."""
    first_entry = readings_by_sign[sign][0]
    return first_entry.value
def print_readings(value, readings, by_source=False):
    """Print value, then one line per reading, to standard error.

    Each line shows the source of the reading if by_source is true, and its
    disambiguator otherwise, followed by the sign, its name, and the comment.
    """
    print(value, file=sys.stderr)
    padding = 8 * ' '
    for entry in readings:
        if by_source:
            label = entry.source.ljust(6)
        else:
            label = '...' + entry.disambiguator.ljust(8)
        print(' ', label, entry.sign, sign_name(entry.sign), padding,
              entry.comment, file=sys.stderr)
# For each value with several readings: drop duplicate readings of the same
# sign (raising on undeclared inconsistencies), then, if the readings come
# from different sign lists, disambiguate them by the initial of their source.
for value, readings in readings_by_value.items():
    if len(readings) > 1:
        # Duplicates, with inconsistent duplicates explicitly listed.
        for reading in readings:
            if not reading.keep:
                continue
            for other in readings:
                if other.keep and other.sign == reading.sign and other is not reading:
                    inconsistent = (
                        (other.comment and reading.comment and
                         other.comment != reading.comment) or
                        (other.source and reading.source and
                         other.source != reading.source))
                    if (inconsistent and
                        (value, sign_name(reading.sign)) not in (
                            # One entry is a superset of the other.
                            ('IL', 'AL x ŠE'),
                            # The comments on these Labat readings are inconsistent
                            # (MesZL: AŠLAG missing vs. MesZL: AŠLAG = TUG2.UD), the
                            # latter being right.
                            ('AŠLAG', 'GIŠ.TUG2.PI.KAR'),
                            # MesZL and Labat readings in agreement, with a ? from MesZL.
                            ('GAMBI', 'MUNUS.UŠ.DI'),
                            # MesZL 905 and 906 unified in Unicode (as in Labat).
                            ('MUR7', 'SIG4'),
                            # Duplicate entries for variants of TA×ḪI unified by Unicode
                            # as 𒋭. They differ only by their comment.
                            ('ALAMMUŠ', 'LAL3'),
                            ('ALAMUŠ', 'LAL3'),
                        )):
                        print_readings(value, readings, by_source=True)
                        raise ValueError('Inconsistent duplicate readings')
                    other.keep = False
        # Ambiguous readings coming from inconsistency between sign lists.
        if any(reading.source and reading.source != 'MesZL' for reading in readings):
            for reading in readings:
                if not reading.source:
                    # NOTE(review): re.match takes (pattern, string); here the
                    # comment is passed as the pattern and the expression as
                    # the string, which looks swapped.  In particular an empty
                    # comment matches anything.  The argument order is kept
                    # as-is because changing it would change which rows raise
                    # below; confirm against the actual sign list.
                    implicit_meszl = any(
                        re.match(
                            other.comment,
                            r'MesZL: (\w+, *)*%s(, *\w+)* = %s' % (
                                value, readings_by_sign[reading.sign][0].value))
                        for other in readings)
                    if implicit_meszl:
                        reading.source = 'MesZL'
                    else:
                        print_readings(value, readings, by_source=True)
                        raise ValueError("Divergent readings with undetermined source")
            # All readings now have a source; if the sources differ, append
            # the initial of the source to the disambiguator.
            if not all(reading.source == readings[0].source for reading in readings):
                for reading in readings:
                    reading.disambiguator += reading.source[0]
# Drop the readings marked as duplicates (keep == False) from both
# dictionaries, in place; keys are retained even if their list becomes empty.
for reading_dict in (readings_by_sign, readings_by_value):
    kept_items = [
        (key, [entry for entry in entries if entry.keep])
        for key, entries in reading_dict.items()
    ]
    reading_dict.clear()
    reading_dict.update(kept_items)
recompute_readings_by_composition()

# Where several readings share a composition, keep the one from the earliest
# row as is and append v1, v2, ... to the disambiguators of the others.
for readings in readings_by_composition.values():
    if len(readings) > 1:
        readings.sort(key=lambda entry: entry.šašková_index)
        for variant_number, reading in enumerate(readings):
            if variant_number:
                reading.disambiguator += 'v%d' % variant_number

recompute_readings_by_composition()

# Every composition must now designate exactly one reading.
for composition, readings in readings_by_composition.items():
    if len(readings) > 1:
        print_readings(composition, readings)
        raise ValueError('Ambiguous composition')
# Sanity check of numbers: 1meow and meow must map to the same sign.
for composition, readings in readings_by_composition.items():
    # Only compositions consisting of a 1 followed by a non-digit are checked.
    # The pattern is a raw string so that \D reaches the regex engine without
    # being an invalid string escape.
    if not re.match(r'^1\D', composition):
        continue
    if composition[1:] not in readings_by_composition:
        continue
    if readings[0].sign == readings_by_composition[composition[1:]][0].sign:
        continue
    if composition in ('1iku', '1buru'):
        # Borger gives iku as a reading for 𒃷 in 𒀸𒃷. Friberg sees that as
        # a determinative, and transcribes it 1iku GAN2. Shrug.
        # Buru seems wtf.
        continue
    print_readings(composition, readings)
    print_readings(composition[1:], readings_by_composition[composition[1:]])
    raise ValueError('Inconsistent numeric readings')
# Emit one "composition"="sign" line per composition that consists solely of
# composition characters.
for composition, readings in readings_by_composition.items():
    typeable = all(is_composition_character(c.lower()) for c in composition)
    # TODO(egg): composition.startswith('x') is a cheesy way to eliminate xv,
    # which happens to be the only reading wherein x is not ₓ at this point.
    if typeable and not composition.startswith('x'):
        print('"%s"="%s"' % (composition, readings[0].sign))