Files
2014-04-25 23:37:08 +12:00

124 lines
3.2 KiB
Python

#!/usr/bin/env python
#coding=utf8
"""
Join dic files to make a dic file for hunspell.
USAGE: join_dicts.py [-h] [-o OUT] <directory>
DESCRIPTION
Join dic files to make a dic file for hunspell.
ARGS:
<directory> directory containing the .dic and .txt files to combine
-o, --out file to write the combined dictionary to
EXAMPLE: python join_dicts.py -o en_NZ_combined.dic dictionaries/
VERSION
$Id$
"""
"""
TOOLS
http://marcoagpinto.cidadevirtual.pt/proofingtoolgui.html An Open Source tool coded in PureBasic for editing the Dictionary/Thesaurus/Hyphenation of OpenOffice/LibreOffice/Firefox/Thunderbird
http://latexeditor.org/credits.html
"""
import sys, os
import argparse
def _find_dictionary_files(directory):
    """Return the paths of the regular ``.dic``/``.txt`` files in *directory*."""
    paths = []
    for name in sorted(os.listdir(directory)):
        # os.listdir() yields bare names; they must be joined to the directory
        # before being tested or opened, otherwise they resolve against the cwd.
        path = os.path.join(directory, name)
        if os.path.isfile(path) and os.path.splitext(name)[1] in ('.dic', '.txt'):
            paths.append(path)
    return paths


def _read_entries(path):
    """Return the lines of *path* as byte strings without their line ending.

    A leading all-digit line (the word count of a .dic file) is dropped.
    """
    with open(path, 'rb') as handle:
        lines = [line.rstrip(b'\r\n') for line in handle]
    # An empty file has no first line to inspect.
    if lines and lines[0].strip().isdigit():
        lines = lines[1:]
    return lines


def _decode_utf8(entries):
    """Decode *entries* as UTF-8; return (decoded entries, number rejected)."""
    decoded = []
    rejected = 0
    for entry in entries:
        try:
            decoded.append(entry.decode('utf-8'))
        except UnicodeDecodeError:
            # Best effort: an undecodable entry is left out, but counted so the
            # caller can report it instead of losing it silently.
            rejected += 1
    return decoded, rejected


def _write_output(out, text):
    """Write *text* UTF-8 encoded to *out* (a file name or a writable stream)."""
    data = text.encode('utf-8')
    if not hasattr(out, 'write'):
        with open(out, 'wb') as handle:
            handle.write(data)
        return
    # Python 3 text streams such as sys.stdout expose the byte stream as
    # ``.buffer``; Python 2 file objects accept bytes directly.
    stream = getattr(out, 'buffer', out)
    try:
        stream.write(data)
    except TypeError:
        # Pure text stream (e.g. io.StringIO): hand over the text itself.
        stream.write(text)


def skeleton(input, out=None):
    """
    Join the dictionary files found in a directory into one hunspell dic file.

    Every regular file in the directory whose extension is ``.dic`` or
    ``.txt`` is read. A leading word-count line is removed from each file,
    empty lines and duplicates are dropped, the entries are sorted, entries
    that are not valid UTF-8 are skipped, and the result is written as UTF-8
    with the new entry count on the first line. Line endings are normalised
    to a single newline.

    input -- path of the directory holding the files to combine.
    out   -- file name or writable stream for the result. When omitted, the
             ``out`` attribute of the module-level ``args`` set by main() is
             used (a NameError is raised if main() has not run).

    Progress messages go to standard output, or to standard error when the
    dictionary itself is written to a stream, so the two never get mixed.
    """
    if out is None:
        out = args.out
    log = sys.stderr if hasattr(out, 'write') else sys.stdout

    paths = _find_dictionary_files(input)
    log.write("Dictionarys to combine:  %s\n" % (paths,))

    raw_entries = []
    for path in paths:
        raw_entries.extend(_read_entries(path))
    log.write("Uncleaned wordlist is %d\n" % len(raw_entries))

    # Sorting the de-duplicated bytes yields the same order as sorting first
    # and then removing repeats.
    unique_entries = sorted(set(entry for entry in raw_entries if entry))
    wordlist, rejected = _decode_utf8(unique_entries)
    if rejected:
        log.write("Skipped %d entries that are not valid UTF-8\n" % rejected)

    len_rows = str(len(wordlist)) + '\n'  # word-count header line
    _write_output(out, len_rows + ''.join(word + '\n' for word in wordlist))
    log.write("Output words %s\n" % len_rows)
def main(argv=None):
    """
    Command-line entry point: parse the arguments and join the dictionaries.

    The command line takes one positional argument, the directory holding
    the files to combine, and an optional -o/--out destination. Passing the
    -h argument causes a help text to be printed.

    argv -- optional list of argument strings to parse. None (the default)
            makes argparse read the process command line.

    The parsed namespace is stored in the module-level ``args`` because
    skeleton() reads ``args.out`` from there.
    """
    global args
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__.split('\n\n\n')[0],
        epilog='You can add some extra information about the arguments here.')
    parser.add_argument('-o', '--out', default=sys.stdout,
                        help='file to write the combined dictionary to')
    parser.add_argument('dics', type=str,
                        help='directory with text files or dics to be combines')
    args = parser.parse_args(argv)
    skeleton(args.dics)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()