update-icu4x.sh (4591B)
#!/bin/sh
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

set -e

# Update the icu4x binary data for a given release:
#   Usage: update-icu4x.sh <URL of ICU4X GIT> <release tag name> <CLDR version> <ICU release tag name> <ICU4X version of icu_capi>
#     update-icu4x.sh https://github.com/unicode-org/icu4x.git icu@1.5.0 45.0.0 release-75-1 1.5.0
#
# Update to the main branch:
#   Usage: update-icu4x.sh <URL of ICU4X GIT> <branch> <CLDR version> <ICU release tag name> <ICU4X version of icu_capi>
#     update-icu4x.sh https://github.com/unicode-org/icu4x.git main 45.0.0 release-75-1 1.5.0
#
# The first two arguments are required; the last three fall back to the
# defaults below when omitted or empty.

# default
cldr=${3:-47.0.0}
icuexport=${4:-release-77-1}
icu4x_version=${5:-2.0.0}

if [ "$#" -lt 2 ]; then
  # This is an error exit, so the usage text goes to stderr.
  echo "Usage: update-icu4x.sh <URL of ICU4X GIT> <ICU4X release tag name> <CLDR version> <ICU release tag name> <ICU4X version for icu_capi>" >&2
  echo "Example: update-icu4x.sh https://github.com/unicode-org/icu4x.git icu@2.0.0 47.0.0 release-77-1 2.0.0" >&2
  exit 1
fi

icu4x_git_url=$1
icu4x_git_ref=$2

# Make a log function so the output is easy to read.
# Arguments: the message words; they are joined with spaces and printed after
# a cyan "[update-icu4x]" prefix.
log() {
  # The message is passed as a %s argument rather than spliced into the format
  # string, so a '%' or '\' in it (e.g. inside a path) is printed literally.
  printf '\033[0;36m[update-icu4x]\033[0m %s\n' "$*"
}

# Specify locale and time zone information for consistent output and reproducibility.
export TZ=UTC
export LANG=en_US.UTF-8
export LANGUAGE=en_US
export LC_ALL=en_US.UTF-8

# Define all of the paths.
original_pwd=$(pwd)
# '&&' (not ';') so that a failed cd aborts the script under 'set -e' instead of
# silently treating the current directory as the source tree root.
top_src_dir=$(cd -- "$(dirname -- "$0")/.." >/dev/null 2>&1 && pwd -P)
segmenter_data_dir=${top_src_dir}/intl/icu_segmenter_data/data
git_info_file=${segmenter_data_dir}/ICU4X-GIT-INFO

# Temporary directories; they stay empty until created so that cleanup() can
# tell which ones exist.
tmpicuexportdir=
tmpclonedir=

# Remove the temporary directories. Runs on every exit path, including an
# abort triggered by 'set -e'.
cleanup() {
  # Step out of the temporary directories before deleting them.
  cd "${original_pwd}" 2>/dev/null || cd /
  [ -z "${tmpclonedir}" ] || rm -rf -- "${tmpclonedir}"
  [ -z "${tmpicuexportdir}" ] || rm -rf -- "${tmpicuexportdir}"
}
trap cleanup EXIT
# Not every sh runs the EXIT trap when killed by a signal; turn the common
# ones into a normal exit so cleanup() still runs.
trap 'exit 129' HUP
trap 'exit 130' INT
trap 'exit 143' TERM

log "Remove the old data"
# ':?' aborts instead of expanding to an empty path.
rm -rf -- "${segmenter_data_dir:?}"

log "Download icuexportdata"
tmpicuexportdir=$(mktemp -d)
# Any '/' in the ICU tag name is replaced with '-' in the archive file name.
icuexport_filename=$(printf '%s\n' "icuexportdata_${icuexport}.zip" | sed 's|/|-|g')
cd "${tmpicuexportdir}"
wget "https://github.com/unicode-org/icu/releases/download/${icuexport}/${icuexport_filename}"

log "Patching icuexportdata to reduce data size"
unzip "${icuexport_filename}"
# Overwrite these dictionary files with the empty placeholder from the tree.
for toml in \
  burmesedict.toml \
  khmerdict.toml \
  laodict.toml \
  thaidict.toml \
  ; do
  cp "${top_src_dir}/intl/icu4x-patches/empty.toml" "${tmpicuexportdir}/segmenter/dictionary/${toml}"
done

log "Clone ICU4X"
tmpclonedir=$(mktemp -d)
git clone --depth 1 --branch "${icu4x_git_ref}" -- "${icu4x_git_url}" "${tmpclonedir}"

log "Change the directory to the cloned repo"
log "${tmpclonedir}"
cd "${tmpclonedir}"

log "Copy icu_capi crate to local since we need a patched version"
rm -rf -- "${top_src_dir:?}/intl/icu_capi"
wget -O icu_capi.tar.gz "https://crates.io/api/v1/crates/icu_capi/${icu4x_version}/download"
tar xf icu_capi.tar.gz -C "${top_src_dir}/intl"
mv "${top_src_dir}/intl/icu_capi-${icu4x_version}" "${top_src_dir}/intl/icu_capi"
rm -f icu_capi.tar.gz

log "Patching icu_capi"
for patch_file in \
  001-Cargo.toml.patch \
  ; do
  patch -d "${top_src_dir}" -p1 --no-backup-if-mismatch < "${top_src_dir}/intl/icu4x-patches/${patch_file}"
done

# ICU4X 1.3 or later with icu_capi uses each compiled_data crate.

log "Run the icu4x-datagen tool to regenerate the segmenter data."
log "Saving the data into: ${segmenter_data_dir}"

# TODO(Bug 1741262) - Should locales be filtered as well? It doesn't appear that the existing ICU
#                     data builder is using any locale filtering.

# --keys <KEYS>...
#     Include this resource key in the output. Accepts multiple arguments.
# --key-file <KEY_FILE>
#     Path to text file with resource keys to include, one per line. Empty lines and
#     lines starting with '#' are ignored.
# NOTE(review): the invocation below selects its data with repeated -m options,
# not --keys/--key-file; the help excerpt above may be stale - confirm against
# the datagen version being cloned.
cargo run --bin icu4x-datagen \
  -- \
  --cldr-tag "${cldr}" \
  --icuexport-root "${tmpicuexportdir}" \
  -m SegmenterBreakGraphemeClusterV1 \
  -m SegmenterBreakLineV1 \
  -m SegmenterBreakSentenceV1 \
  -m SegmenterBreakSentenceOverrideV1 \
  -m SegmenterBreakWordV1 \
  -m SegmenterBreakWordOverrideV1 \
  -m SegmenterLstmAutoV1 \
  -m SegmenterDictionaryAutoV1 \
  -m SegmenterDictionaryExtendedV1 \
  --locales full \
  --format baked \
  --out "${segmenter_data_dir}"

log "Record the current cloned git information to:"
log "${git_info_file}"
# (This ensures that if ICU modifications are performed properly, it's always
# possible to run the command at the top of this script and make no changes to
# the tree.)
git -C "${tmpclonedir}" log -1 > "${git_info_file}"

log "Clean up the tmp directory"
# The EXIT trap (cleanup) returns to the original directory and removes both
# temporary directories.