tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

make-new-dict.sh (5795B)


      1 #! /usr/bin/env sh
      2 
      3 # This Source Code Form is subject to the terms of the Mozilla Public
      4 # License, v. 2.0. If a copy of the MPL was not distributed with this
      5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      6 
      7 # This script creates a new dictionary by expanding the original,
      8 # Mozilla's, and the upstream dictionary to remove affix flags and
      9 # then doing the wordlist equivalent of diff3 to create a new
     10 # dictionary.
     11 #
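# The files support_files/2-mozilla-added.txt and
# support_files/2-mozilla-removed.txt contain the words added and removed,
# respectively, in the Mozilla dictionary. The final dictionary is written
# to en_US-mozilla.dic; the intermediate hunspell-en_US-mozilla.zip is
# deleted at the end of this script.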
     15 
     16 set -e
     17 
     18 export LANG=C
     19 export LC_ALL=C
     20 export LC_CTYPE=C
     21 export LC_COLLATE=C
     22 
     23 WKDIR="`pwd`"
     24 ORIG="$WKDIR/orig"
     25 SUPPORT_DIR="$WKDIR/support_files"
     26 SPELLER="$WKDIR/scowl/speller"
     27 
     28 # This is required by scowl scripts
     29 export SCOWL="$WKDIR/scowl/"
     30 
     31 expand() {
     32  grep -v '^[0-9]\+$' | $SPELLER/munch-list expand $1 | sort -u
     33 }
     34 
     35 if [ ! -d "$SPELLER" ]; then
     36  echo "The 'scowl' folder is missing. Check the documentation at"
     37  echo "https://firefox-source-docs.mozilla.org/extensions/spellcheck/index.html"
     38  exit 1
     39 fi
     40 
     41 mkdir -p $SUPPORT_DIR
     42 cd $SPELLER
     43 MK_LIST="../mk-list -v1 --accents=both en_US 60"
     44 cat <<EOF > params.txt
     45 With Input Command: $MK_LIST
     46 EOF
     47 # Note: the output of make-hunspell-dict is UTF-8
     48 $MK_LIST | ./make-hunspell-dict -one en_US-custom params.txt > ./make-hunspell-dict.log
     49 cd $WKDIR
     50 
     51 # Note: Input and output of "expand" is always ISO-8859-1.
     52 #       All expanded word list files are thus in ISO-8859-1.
     53 expand $SPELLER/en.aff < $SPELLER/en.dic.supp > $SUPPORT_DIR/0-special.txt
     54 
     55 # Input is UTF-8, expand expects ISO-8859-1 so use iconv
     56 iconv -f utf-8 -t iso-8859-1 $ORIG/en_US-custom.dic | expand $ORIG/en_US-custom.aff > $SUPPORT_DIR/1-base.txt
     57 
     58 # Store suggestion exclusions (ending with !) defined in current Mozilla dictionary.
     59 # Save both the compressed (munched) and expanded version.
     60 grep '!$' ../en-US.dic > $SUPPORT_DIR/2-mozilla-nosug-munched.txt
     61 expand ../en-US.aff < $SUPPORT_DIR/2-mozilla-nosug-munched.txt > $SUPPORT_DIR/2-mozilla-nosug.txt
     62 
     63 # Remove suggestion exclusions and expand the existing Mozilla dictionary.
     64 # The existing Mozilla dictionary is already in ISO-8859-1.
     65 grep -v '!$' < ../en-US.dic > $SUPPORT_DIR/en-US-nosug.dic
     66 expand ../en-US.aff < $SUPPORT_DIR/en-US-nosug.dic > $SUPPORT_DIR/2-mozilla.txt
     67 rm $SUPPORT_DIR/en-US-nosug.dic
     68 
     69 # Input is UTF-8, expand expects ISO-8859-1 so use iconv
     70 iconv -f utf-8 -t iso-8859-1 $SPELLER/en_US-custom.dic | expand $SPELLER/en_US-custom.aff > $SUPPORT_DIR/3-upstream.txt
     71 
     72 # Suppress common lines and lines only in the 2nd file, leaving words that are
     73 # only available in the 1st file (SCOWL), i.e. were removed by Mozilla.
     74 comm -23 $SUPPORT_DIR/1-base.txt $SUPPORT_DIR/2-mozilla.txt > $SUPPORT_DIR/2-mozilla-removed.txt
     75 
     76 # Suppress common lines and lines only in the 1st file, leaving words that are
     77 # only available in the 2nd file (current Mozilla dictionary), i.e. were added
     78 # by Mozilla.
     79 comm -13 $SUPPORT_DIR/1-base.txt $SUPPORT_DIR/2-mozilla.txt > $SUPPORT_DIR/2-mozilla-added.txt
     80 
     81 # Suppress common lines and lines only in the 2nd file, leaving words that are
     82 # only available in the 1st file (words from the new upstream SCOWL dictionary).
     83 # The result is upstream, minus the words removed, plus the words added.
     84 comm -23 $SUPPORT_DIR/3-upstream.txt $SUPPORT_DIR/2-mozilla-removed.txt | cat - $SUPPORT_DIR/2-mozilla-added.txt | sort -u > $SUPPORT_DIR/4-patched.txt
     85 
     86 # Note: the output of make-hunspell-dict is UTF-8
     87 cat $SUPPORT_DIR/4-patched.txt | comm -23 - $SUPPORT_DIR/0-special.txt | $SPELLER/make-hunspell-dict -one en_US-mozilla /dev/null
     88 
     89 # Add back Mozilla suggestion exclusions. Need to convert the file from
     90 # ISO-8859-1 to UTF-8 first, then add back the line count and reorder.
     91 tail -n +2 en_US-mozilla.dic > en_US-mozilla-complete.dic
     92 iconv -f iso-8859-1 -t utf-8 $SUPPORT_DIR/2-mozilla-nosug-munched.txt >> en_US-mozilla-complete.dic
     93 wc -l < en_US-mozilla-complete.dic | tr -d '[:blank:]' > en_US-mozilla.dic
     94 LC_ALL=C sort en_US-mozilla-complete.dic >> en_US-mozilla.dic
     95 rm -f en_US-mozilla-complete.dic
     96 
     97 # Sanity check should yield identical results
#comm -23 $SUPPORT_DIR/1-base.txt $SUPPORT_DIR/3-upstream.txt > $SUPPORT_DIR/3-upstream-removed.txt
     99 #comm -13 $SUPPORT_DIR/1-base.txt $SUPPORT_DIR/3-upstream.txt > $SUPPORT_DIR/3-upstream-added.txt
    100 #comm -23 $SUPPORT_DIR/2-mozilla.txt $SUPPORT_DIR/3-upstream-removed.txt | cat - $SUPPORT_DIR/3-upstream-added.txt | sort -u > $SUPPORT_DIR/4-patched-v2.txt
    101 
    102 expand ../en-US.aff < mozilla-specific.txt > 5-mozilla-specific.txt
    103 
    104 # Update Mozilla removed and added wordlists based on the new upstream
    105 # dictionary, save them as UTF-8 and not ISO-8951-1.
    106 # Ignore words excluded from suggestions for both files.
    107 comm -12 $SUPPORT_DIR/3-upstream.txt $SUPPORT_DIR/2-mozilla-removed.txt > $SUPPORT_DIR/5-mozilla-removed-tmp.txt
    108 comm -23 $SUPPORT_DIR/5-mozilla-removed-tmp.txt $SUPPORT_DIR/2-mozilla-nosug.txt > $SUPPORT_DIR/5-mozilla-removed.txt
    109 rm $SUPPORT_DIR/5-mozilla-removed-tmp.txt
    110 iconv -f iso-8859-1 -t utf-8 $SUPPORT_DIR/5-mozilla-removed.txt > 5-mozilla-removed.txt
    111 
    112 comm -13 $SUPPORT_DIR/3-upstream.txt $SUPPORT_DIR/2-mozilla-added.txt > $SUPPORT_DIR/5-mozilla-added-tmp.txt
    113 comm -23 $SUPPORT_DIR/5-mozilla-added-tmp.txt $SUPPORT_DIR/2-mozilla-nosug.txt > $SUPPORT_DIR/5-mozilla-added.txt
    114 rm $SUPPORT_DIR/5-mozilla-added-tmp.txt
    115 iconv -f iso-8859-1 -t utf-8 $SUPPORT_DIR/5-mozilla-added.txt > 5-mozilla-added.txt
    116 
    117 # Clean up some files
    118 rm hunspell-en_US-mozilla.zip
    119 rm nosug
    120 
    121 # Remove backup folders in preparation for the install-new-dict script
    122 FOLDERS=( "orig-bk" "mozilla-bk")
    123 for f in ${FOLDERS[@]}; do
    124  if [ -d "$SUPPORT_DIR/$f" ]; then
    125    echo "Removing backup folder $f"
    126    rm -rf "$SUPPORT_DIR/$f"
    127  fi
    128 done