tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

edit-dictionary.sh (2768B)


      1 #! /usr/bin/env sh
      2 
      3 # This Source Code Form is subject to the terms of the Mozilla Public
      4 # License, v. 2.0. If a copy of the MPL was not distributed with this
      5 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      6 
      # Abort as soon as any command fails.
      set -e

      # Directory the script was started from; the speller tools are looked up
      # under its 'scowl' sub-folder.
      WKDIR="$(pwd)"
      SPELLER="$WKDIR/scowl/speller"
     11 
     # Run a word list through 'munch-list munch' to compress it with affix rules.
     # Globals:   SPELLER (read) - directory containing the munch-list tool
     # Arguments: $1 - path to the affix (.aff) file passed to munch-list
     # Input:     word list on stdin
     # Output:    munch-list's output on stdout, sorted with duplicates removed
     munch() {
       # Both expansions are quoted so paths containing spaces or glob
       # characters reach munch-list as single arguments.
       "$SPELLER/munch-list" munch "$1" | sort -u
     }
     15 
     # Expand dictionary entries into a plain word list via 'munch-list expand'.
     # Globals:   SPELLER (read) - directory containing the munch-list tool
     # Arguments: $1 - path to the affix (.aff) file passed to munch-list
     # Input:     dictionary entries on stdin
     # Output:    munch-list's output on stdout, sorted with duplicates removed
     expand() {
       # Lines made up only of digits are dropped before expansion. The pattern
       # uses '[0-9][0-9]*' because '\+' is not part of POSIX basic regexes.
       grep -v '^[0-9][0-9]*$' | "$SPELLER/munch-list" expand "$1" | sort -u
     }
     19 
     # Stop early when the speller tools directory is not present. The messages
     # are diagnostics, so they are written to stderr.
     if [ ! -d "$SPELLER" ]; then
       {
         echo "The 'scowl' folder is missing. Check the documentation at"
         echo "https://firefox-source-docs.mozilla.org/extensions/spellcheck/index.html"
       } >&2
       exit 1
     fi

     # The word list is entered through the user's editor, so $EDITOR must be set.
     if [ -z "$EDITOR" ]; then
       echo 'Need to set the $EDITOR environment variable to your favorite editor.' >&2
       exit 1
     fi
     30 
     # Open the editor and allow the user to type or paste words
     echo "Editor is going to open, you can add the list of words. Quit the editor to finish editing."
     echo "Press Enter to begin."
     # Only waits for Enter; whatever was typed is discarded.
     read -r _reply
     # $EDITOR stays unquoted so that a value carrying options (for example
     # "code --wait") is still split into a command and its arguments.
     # shellcheck disable=SC2086
     $EDITOR temp-list.txt

     if [ ! -f temp-list.txt ]; then
       echo "The content of the editor hasn't been saved." >&2
       exit 1
     fi

     # Remove empty lines. The file is rewritten through a scratch copy instead of
     # 'sed -i', whose argument syntax differs between BSD and GNU sed.
     sed '/^$/d' temp-list.txt > temp-list.clean
     mv temp-list.clean temp-list.txt
     43 
     # Munching gives a different result for words excluded from suggestions
     # (entries ending in '!') and for numerals, so both groups are set aside
     # from the UTF-8 dictionary copy before anything else happens.
     grep '!$' utf8/en-US-utf8.dic > en-US-nosug.txt
     grep '^[0-9][a-z/]' utf8/en-US-utf8.dic > en-US-numerals.txt

     # Take the current en-US dictionary without its first line (the entry
     # count) and re-encode it from ISO-8859-1 to UTF-8.
     tail -n +2 ../en-US.dic > en-US.stripped
     iconv -f iso-8859-1 -t utf-8 en-US.stripped > en-US.utf8
     rm en-US.stripped

     # Turn the dictionary entries into a flat word list.
     expand ../en-US.aff < en-US.utf8 > en-US-wordlist.txt
     rm en-US.utf8

     # Append the words the user entered.
     cat temp-list.txt >> en-US-wordlist.txt
     rm temp-list.txt

     # Leave out every word that starts with a digit.
     grep -v '^[0-9]' en-US-wordlist.txt > en-US-wordlist-nonum.txt
     rm en-US-wordlist.txt

     # Compress the word list back into dictionary entries wherever the affix
     # rules allow it.
     munch ../en-US.aff < en-US-wordlist-nonum.txt > en-US-munched.dic
     rm en-US-wordlist-nonum.txt
     73 
     # Remove words that should not be suggested.
     #
     # remove_nosug_words drops from a dictionary file every entry whose word
     # (the text before the first '/', or the whole line when there is none)
     # also occurs as the word of an entry in the exclusion file.
     # Arguments: $1 - exclusion list, one dictionary entry per line
     #            $2 - dictionary file to filter; rewritten in place
     # Returns:   0 on success, 1 when the filtering step fails ($2 is then
     #            left untouched)
     #
     # Words are compared literally in a single awk pass. This needs no bash
     # arrays and no 'sed -i', and a word such as "a.c" cannot match "abc".
     remove_nosug_words() {
       # 'pass' is switched on the command line between the two files, so an
       # empty exclusion list is handled correctly.
       awk -F/ '
         pass == 1 { skip[$1] = 1; next }
         !($1 in skip)
       ' pass=1 "$1" pass=2 "$2" > "$2.filtered" || {
         rm -f "$2.filtered"
         return 1
       }
       mv "$2.filtered" "$2"
     }

     remove_nosug_words en-US-nosug.txt en-US-munched.dic
     79 
     # Add back suggestion exclusions and numerals from the original .dic file
     cat en-US-nosug.txt en-US-numerals.txt >> en-US-munched.dic
     rm en-US-nosug.txt en-US-numerals.txt

     # Add back the line count and sort the lines
     wc -l < en-US-munched.dic | tr -d '[:blank:]' > en-US.dic
     LC_ALL=C sort en-US-munched.dic >> en-US.dic
     rm -f en-US-munched.dic

     # Convert back to ISO-8859-1. The result is staged in a scratch file and
     # only moved over ../en-US.dic afterwards: the script runs under 'set -e',
     # so a failed conversion stops here and leaves the existing dictionary
     # intact instead of truncated.
     iconv -f utf-8 -t iso-8859-1 en-US.dic > en-US.latin1
     mv en-US.latin1 ../en-US.dic

     # Keep a copy of the UTF-8 file in the utf8/ folder
     mv en-US.dic utf8/en-US-utf8.dic