Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 29 additions & 16 deletions scriptshifter/tables/data/_ignore_base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,24 @@ general:
roman_to_script:
ignore:
- "At head of title"
- "at head of title"
- "Colophon"
- "colophon"
- "Colophon"
- "Cover title"
- "On cover"
- "S.l."
- "Spine title"
- "and one other"
- "at head of title"
- "colophon"
- "cover title"
- "date of publication not identified"
- "et al."
- "on cover"
- "place of publication not identified"
- "publisher not identified"
- "and one other"
- "and others"
- "et al."
- "s.l."
- "s.n."
- "spine title"
ignore_ptn:
- "and ([a-z0-9]+ )?others"

Expand All @@ -29,17 +37,22 @@ roman_to_script:
# dedicated U+2160÷U+216F (uppercase Roman
# numerals) and/or U+2170÷U+217F (lower case Roman
# numerals) ranges to avoid this ambiguity.
- "I{2,3}\\b"
- "I(V|X)\\b"
- "LI{,3}\\b"
- "LI?(V|X)\\b"
- "L(V|X{1,3})I{,3}\\b"
- "LX{1,3}I?V\\b"
- "LX{1,3}VI{,3}\\b"
- "VI{1,3}\\b"
- "X{1,3}I{1,3}\\b"
- "X{1,3}I(V|X)\\b"
- "X{1,3}VI{,3}\\b"
- "M{,3}(CM)?C?D?C{1,3}L?X{,3}I{,3}\\b"
- "M{1,3}(CM)?C?D?C{,3}L?X{,3}I{,3}\\b"
- "M{,3}(CM)?C?D?C{1,3}L?X{,3}I[VX]\\b"
- "M{1,3}(CM)?C?D?C{,3}L?X{,3}I[VX]\\b"

# May not be prefixed by M, D, C, L. Cannot use for single digits.
- "M{,3}(CM)?C?D?C{,3}I(I{,2}V|X)\\b"
- "M{,3}(CM)?C?D?C{,3}LI{1,3}\\b"
- "M{,3}(CM)?C?D?C{,3}LI?[VX]\\b"
- "M{,3}(CM)?C?D?C{,3}L(V|X{1,3})I{,3}\\b"
- "M{,3}(CM)?C?D?C{,3}LX{1,3}I?[VX]\\b"
- "M{,3}(CM)?C?D?C{,3}LX{1,3}VI{,3}\\b"
- "M{,3}(CM)?C?D?C{,3}VI{1,3}\\b"
- "M{,3}(CM)?C?D?C{,3}X{1,3}C?I{1,3}\\b"
- "M{,3}(CM)?C?D?C{,3}X{1,3}C?I[VX]\\b"
- "M{,3}(CM)?C?D?C{,3}X{1,3}C?VI{,3}\\b"

# MARC sub-field markers.
- "[\u2021\u01C2\\$][0-9a-z]\\b"
327 changes: 327 additions & 0 deletions scriptshifter/tables/data/arabic 2025-12-01.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
# Arabic S2R using the 3rd-party ArabicTransliterator library:
# https://github.com/MTG/ArabicTransliterator

---
general:
name: Arabic
parents:
- _ignore_base
description: >
Version 1.0 (2025-11-29) - Arabic language R2S using a conversion table; S2R using a 3rd party library.
case_sensitive: false

roman_to_script:
map:

# Punctuation marks:
"*": "\u066D"
",": "\u060C"
";": "\u061B"
"?": "\u061F"

# Exceptions for specific words

# Allah
"%alla\u0304h%": "\uFDF2"
"alla\u0304h": "\u0627\u0644\u0644\u0647"

# Qur'an
"qur\u02BCa\u0304n": "\u0642\u0631\u0622\u0646"

# lillah
"lilla\u0304h": "\u0644\u0644\u0647"

# billah
"billa\u0304h": "\u0628\u0644\u0644\u0647"

# Rahman
"rah\u0323ma\u0304n": "\u0631\u062D\u0645\u0646"

# Ruwat
"ruwa\u0304t": "\u0631\u0648\u0627\u0629"

# Hadha
"ha\u0304dha\u0304": "\u0647\u0630\u0627"

# Hadhihi
"ha\u0304dhi\u0304hi": "\u0647\u0630\u0647"

# dhalika
"dha\u0304lika": "\u0630\u0644\u0643"

# Ibn when it appears in the middle of a name sequence
"ibn": "\u0628\u0646"

# H[dot below]aya[macron]t
"h\u0323aya\u0304t": "\u062D\u064A\u0627\u0629"

# "sh[dot below]" as in "Ishaq"
"sh\u0323": "\u0633\u062D"

# "s[prime]h" combos
"s\u02B9h": "\u0633\u0647"

# "th[dot below]"
"th\u0323": "\u062A\u062D"

# dh[dot below]
"dh\u0323": "\u062F\u062D"

# La-hu
"la-hu": "\u0644\u0647"

# Mi'ah
"mi\u02BEah": "\u0645\u0627\u0626\u0629"
"mi\u02BCah": "\u0645\u0627\u0626\u0629"

# Mi'at
"mi\u02BEat": "\u0645\u0627\u0626\u0629"
"mi\u02BCat": "\u0645\u0627\u0626\u0629"

# Numbers (I have set these to Hindi numbers. Note that Persian and Urdu
# will technically use \u06F0-\u06F9. This needs further discussion with PSD,
# as RLIN21 used Hindi numbers; Connexion and Voyager do not.)

# Edition statements with Latin number
"al-t\u0323ab\u02BBah 1": "\u0627\u0644\u0637\u0628\u0639\u0629 1"
"al-t\u0323ab\u02BBah 2": "\u0627\u0644\u0637\u0628\u0639\u0629 2"
"al-t\u0323ab\u02BBah 3": "\u0627\u0644\u0637\u0628\u0639\u0629 3"
"al-t\u0323ab\u02BBah 4": "\u0627\u0644\u0637\u0628\u0639\u0629 4"
"al-t\u0323ab\u02BBah 5": "\u0627\u0644\u0637\u0628\u0639\u0629 5"
"al-t\u0323ab\u02BBah 6": "\u0627\u0644\u0637\u0628\u0639\u0629 6"
"al-t\u0323ab\u02BBah 7": "\u0627\u0644\u0637\u0628\u0639\u0629 7"
"al-t\u0323ab\u02BBah 8": "\u0627\u0644\u0637\u0628\u0639\u0629 8"
"al-t\u0323ab\u02BBah 9": "\u0627\u0644\u0637\u0628\u0639\u0629 9"

# Use Basic Arabic-Indic digits \u0660-\u0669
"0": "\u0660"
"1": "\u0661"
"2": "\u0662"
"3": "\u0663"
"4": "\u0664"
"5": "\u0665"
"6": "\u0666"
"7": "\u0667"
"8": "\u0668"
"9": "\u0669"

# Hyphenated prefixes:
"wa-": "\u0648"
"bi-": "\u0628"
"al-": "\u0627\u0644"
"lil-": "\u0644\u0644"
"li-": "\u0644"
"la\u0304-": "\u0644"
"fi\u0304-": "\u0641\u064A"
"ka-": "\u0643"

# Vowels and vowel/consonant combinations - ta-marbutah at end of word
"ah%": "\u0629"
"at%": "\u0629"

# tanwin at end of word
"an%": "\u0627"

# ayn-alif combo
"\u02BBa\u0304\u02BE%": "\u0639\u0627\u0621"
"\u02BBa\u0304\u02BC%": "\u0639\u0627\u0621"

"\u02BBa\u0304": "\u0639\u0627"

"\u02BBi\u0304y": "\u0639\u064A"
"\u02BBi\u0304": "\u0639\u064A"

"\u02BBu\u0304": "\u0639\u0648"
"\u02BBu": "\u0639"

"%\u02BBa": "\u0639"
# "\u02BBa%": "\u0639"

# alif and hamzas for all occasions
#
# Every entry is given twice because the romanized hamza may be typed either as
# U+02BE (modifier letter right half ring) or as U+02BC (modifier letter
# apostrophe); both variants map to the same Arabic output.

# Is the trailing "%" (end-of-word marker) necessary on the keys below? It
# seems to work fine with it.

"i\u0304\u02BEah%": "\u064A\u0626\u0629"
"i\u0304\u02BCah%": "\u064A\u0626\u0629"

"i\u0304\u02BEat%": "\u064A\u0626\u0629"
"i\u0304\u02BCat%": "\u064A\u0626\u0629"

"i\u02BEa\u0304%": "\u0626\u0627"
"i\u02BCa\u0304%": "\u0626\u0627"

# The bare "i\u02BE" / "i\u02BC" keys are intentionally NOT defined here: they
# are defined in the "I" section further down, and a mapping key may appear
# only once. The copies that used to sit here were duplicates (silently
# discarded by last-wins YAML loaders) and carried a stray "%" in the value.

"a\u0304\u02BEa\u0304": "\u0627\u0621\u0627"
"a\u0304\u02BCa\u0304": "\u0627\u0621\u0627"

"a\u02BE": "\u0623"
"a\u02BC": "\u0623"
"\u02BEi": "\u0626"
"\u02BCi": "\u0626"
"\u02BEa\u0304": "\u0622"
"\u02BCa\u0304": "\u0622"
"\u02BEa": "\u0623"
"\u02BCa": "\u0623"

"y\u02BCah": "\u064A\u0626\u0629"
"y\u02BEah": "\u064A\u0626\u0629"

"y\u02BCat": "\u064A\u0626\u0629"
"y\u02BEat": "\u064A\u0626\u0629"

# A

"a\u0304\u02BCi\u0304": "\u0627\u0626\u064A"
"a\u0304\u02BEi\u0304": "\u0627\u0626\u064A"

"a\u0304\u02BCi": "\u0627\u0626"
"a\u0304\u02BEi": "\u0627\u0626"
"a\u0304\u02BC": "\u0627\u0621"
"a\u0304\u02BE": "\u0627\u0621"
"%a\u0304": "\u0622"
"a\u0304": "\u0627"

# These next two lines were intended to convert to alif-ayn when it is at
# the beginning of a word, definite or indefinite (i.e.
# "al-a[ayn]ma[macron]l" or "[space]a[ayn]ma[macron]l").
"%a\u02BB": "\u0623\u0639"
"a\u02BB": "\u0639"
"a\u0301": "\u0649"

"ayy": "\u064A"
"%a": "\u0623"
"a": ""

# I - Capital I at beginning of word is usually alif hamzah-below.
#
# In the keys, "%" marks a word boundary: a leading "%" restricts the entry to
# the start of a word.

# Long i (i + combining macron), with or without a following y, and "iy" all
# become a single ya (U+064A).
"%i\u0304": "\u064A"
"i\u0304y": "\u064A"
"iy": "\u064A"
"i\u0304": "\u064A"
# Word-initial ayn followed by short i: only the ayn (U+0639) is written.
"%\u02BBi": "\u0639"

# Disabled entry, kept for reference: i + ayn -> alif hamza-below + ayn.
# "i\u02BB": "\u0625\u0639"

# Short i followed by hamza, in both romanized hamza variants (U+02BE / U+02BC).
# NOTE(review): the U+02BE form yields ya-hamza (U+0626) alone, while the
# U+02BC form yields alif + ya-hamza (U+0627 U+0626) - confirm that this
# difference is intended.
"i\u02BE": "\u0626"
"i\u02BC": "\u0627\u0626"

# Word-initial short i is written as alif with hamza below (U+0625); any other
# short i maps to the empty string, i.e. it is not written.
"%i": "\u0625"
"i": ""

# U

"u\u0304\u02BE": "\u0624"
"u\u0304\u02BC": "\u0624"
"%u\u0304w": "\u0623\u0648"
"%u\u0304": "\u0623\u0648"
"u\u0304w": "\u0648"
"u\u0304": "\u0648"
"u\u02BE": "\u0624"
"u\u02BC": "\u0624"

"%u": "\u0623"
"u": ""

# Consonants, with tashdid added

"bb": "\u0628"
"b": "\u0628"
"thth": "\u062B"
"th": "\u062B"
"t\u0323t\u0323": "\u0637"
"t\u0323": "\u0637"
"tt": "\u062A"
"t": "\u062A"
"J": "\u062C"
"jj": "\u062C"
"j": "\u062C"
"h\u0323h\u0323": "\u062D"
"h\u0323": "\u062D"
"hh": "\u0647"
"h": "\u0647"
"Kh": "\u062E"
"khkh": "\u062E"
"kh": "\u062E"
"kk": "\u0643"
"k": "\u0643"
"dhdh": "\u0630"
"dh": "\u0630"
"d\u0323d\u0323": "\u0636"
"d\u0323": "\u0636"
"dd": "\u062F"
"d": "\u062F"
"rr": "\u0631"
"r": "\u0631"
"z\u0323z\u0323": "\u0638"
"z\u0323": "\u0638"
"zz": "\u0632"
"z": "\u0632"
"shsh": "\u0634"
"sh": "\u0634"
"s\u0323s\u0323": "\u0635"
"s\u0323": "\u0635"
"ss": "\u0633"
"s": "\u0633"
"ghgh": "\u063A"
"gh": "\u063A"
"ff": "\u0641"
"f": "\u0641"
"qq": "\u0642"
"q": "\u0642"
"ll": "\u0644"
"l": "\u0644"
"mm": "\u0645"
"m": "\u0645"
"nn": "\u0646"
"n": "\u0646"
"ww": "\u0648"
"w": "\u0648"
"yy": "\u064A"
"y": "\u064A"

# non-Arabic consonants:
"p": "\u067E"
"ch": "\u0686"
"v": "\u06A4"
"g": "\u06AF"

# Diacritic characters:
# ain (\u0639) - not transliterated alone:
"\u02BB": "\u0639"
# hamza - not romanized
# "\u0621"
# hamza (alone in final position)
"\u02BE%": "\u0621"
"\u02BC%": "\u0621"

# Do not know what, if anything, is needed here:
# tatweel:
# "\u0640"
# fathatan:
# "\u064B"
# dammatan:
# "\u064C"
# kasratan:
# "\u064D"
# fatha:
# "\u064E"
# damma:
# "\u064F"
# kasra:
# "\u0650"
# shadda:
# "\u0651"
# sukun:
# "\u0652"
# superscript alef:
# "\u0670"
# alef wasla
# "\u0671"


# Script-to-Roman direction. No mapping table is given here; only a hook is
# registered (the header comment of this file attributes S2R to the
# third-party ArabicTransliterator library).
script_to_roman:
hooks:
# Name of the hook point the function below is attached to.
# NOTE(review): presumably runs once the configuration has been set up -
# confirm against the hook documentation.
post_config:
# NOTE(review): indentation is not visible in this view; the two dashes look
# like a list nested inside a list, whose first item is the dotted path of the
# function to call - confirm.
-
- arabic.arabic_romanizer.s2r_post_config
Loading