First commit

This commit is contained in:
wassname
2014-04-25 23:37:08 +12:00
parent e94e0a7eeb
commit 06b7292319
6 changed files with 233599 additions and 0 deletions
+123
View File
@@ -0,0 +1,123 @@
#!/usr/bin/env python
#coding=utf8
"""
Join dic files to make a dic file for hunspell.
USAGE: join_dicts.py [-h] [-o OUT] <directory>
DESCRIPTION
Join dic files to make a dic file for hunspell.
ARGS:
<directory> directory containing the .dic and .txt word lists to combine
-o, --out file to write the combined dictionary to
EXAMPLE: python join_dicts.py -o en_combined.dic path/to/dictionaries
VERSION
$Id$
"""
"""
TOOLS
http://marcoagpinto.cidadevirtual.pt/proofingtoolgui.html An Open Source tool coded in PureBasic for editing the Dictionary/Thesaurus/Hyphenation of OpenOffice/LibreOffice/Firefox/Thunderbird
http://latexeditor.org/credits.html
"""
import sys, os
import argparse
def skeleton(input, out=None):
    """
    Merge every .dic/.txt word list in a directory into one hunspell .dic file.

    The word lists are read as raw bytes, a leading word-count line (as found
    in .dic files) is dropped, blank lines and duplicates are removed, lines
    that are not valid UTF-8 are skipped, and the remaining words are written
    sorted, preceded by a first line holding the word count.

    Args:
        input: path of the directory to scan (not recursive). The name shadows
            the builtin but is kept because it is the public parameter name.
        out: where to write the result. Either a file path, or a file-like
            object that accepts bytes (or exposes a ``buffer`` attribute that
            does, like ``sys.stdout`` on Python 3). When omitted, the
            module-global ``args.out`` set by ``main()`` is used.

    Raises:
        OSError/IOError: if the directory or a file cannot be read or the
            output cannot be written.
    """
    if out is None:
        out = args.out  # fall back to the command-line namespace stored by main()

    # os.listdir returns bare names, so join them with the directory before
    # testing them; otherwise the check is made against the current directory.
    paths = []
    for name in sorted(os.listdir(input)):
        path = os.path.join(input, name)
        if os.path.isfile(path) and os.path.splitext(name)[1] in ('.dic', '.txt'):
            paths.append(path)

    # Keep progress messages out of the dictionary when it goes to stdout.
    log = sys.stderr if out is sys.stdout else sys.stdout
    log.write("Dictionarys to combine:  %s\n" % (paths,))

    collected = []
    for path in paths:
        with open(path, 'rb') as handle:
            # Strip line terminators so "word", "word\n" and "word\r\n" compare
            # equal and a missing final newline cannot glue two words together.
            lines = [line.rstrip(b'\r\n') for line in handle]
        if lines and lines[0].strip().isdigit():
            lines = lines[1:]  # remove the first line, which is a word count in dic files
        collected.extend(lines)
    log.write("Uncleaned wordlist is %d\n" % len(collected))

    words = []
    skipped = 0
    for line in sorted(set(collected)):
        if not line:
            continue  # blank line
        try:
            line.decode('utf-8')
        except UnicodeDecodeError:
            skipped += 1  # not valid UTF-8: leave it out, but report it below
            continue
        words.append(line)
    if skipped:
        log.write("Skipped %d lines that are not valid UTF-8\n" % skipped)

    # The first line of a hunspell .dic file is the number of entries.
    header = str(len(words)).encode('ascii')
    payload = b'\n'.join([header] + words) + b'\n'
    if hasattr(out, 'write'):
        getattr(out, 'buffer', out).write(payload)
    else:
        with open(out, 'wb') as sink:
            sink.write(payload)
    log.write("Output words %d\n" % len(words))
#skeleton
def main():
    """
    Command-line entry point.

    Builds the argument parser, parses the command line and hands the
    positional ``dics`` directory to ``skeleton``. The parsed namespace is
    stored in the module-global ``args`` because ``skeleton`` reads
    ``args.out`` from it. ``-o/--out`` defaults to the ``sys.stdout`` object.
    Passing -h prints the help text, whose description is the part of the
    module docstring before the first pair of blank lines.
    """
    global args

    summary = __doc__.split('\n\n\n')[0]
    parser = argparse.ArgumentParser(
        description=summary,
        epilog='You can add some extra information about the arguments here.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('-o', '--out', default=sys.stdout)
    parser.add_argument(
        'dics',
        type=str,
        help='directory with text files or dics to be combines',
    )

    args = parser.parse_args()
    skeleton(args.dics)


# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
+1185
View File
File diff suppressed because it is too large Load Diff
+846
View File
@@ -0,0 +1,846 @@
\AA
\SS
\aa
\cr
\dp
\em
\fi
\ht
\if
\or
\wd
\Box
\TeX
\and
\box
\day
\def
\end
\exa
\exb
\exc
\fam
\flq
\frq
\glq
\grq
\hss
\ifx
\let
\lhd
\mag
\mho
\par
\put
\ref
\rhd
\the
\toc
\vss
\Alph
\HUGE
\Huge
\Join
\alph
\atop
\char
\cite
\copy
\crcr
\date
\dots
\else
\emph
\eqno
\fbox
\fill
\font
\frac
\gdef
\glqq
\grqq
\hbox
\hfil
\huge
\item
\kern
\kill
\left
\list
\long
\mark
\math
\mbox
\null
\omit
\oval
\over
\part
\plus
\quad
\read
\rule
\sbox
\show
\skip
\span
\stop
\time
\tiny
\toks
\vbox
\verb
\vfil
\vtop
\xdef
\year
\LARGE
\LaTeX
\Large
\Roman
\above
\addto
\array
\begin
\cline
\count
\ddots
\depth
\dimen
\frame
\fussy
\hfill
\hfuzz
\hline
\hrule
\hsize
\hskip
\ifcat
\ifdim
\ifeof
\ifnum
\ifodd
\index
\input
\label
\large
\ldots
\leqno
\lower
\lrbox
\minus
\mkern
\month
\mskip
\newif
\outer
\qquad
\raise
\relax
\right
\roman
\small
\smash
\space
\title
\today
\unlhd
\unrhd
\value
\vfill
\vfuzz
\vline
\vrule
\vsize
\vskip
\width
\write
\LaTeXe
\accent
\arabic
\author
\bezier
\center
\circle
\csname
\divide
\family
\forall
\global
\halign
\height
\hspace
\ifcase
\ifhbox
\iftrue
\ifvbox
\ifvoid
\indent
\insert
\lccode
\limits
\mathbf
\mathit
\mathop
\mathrm
\mathsf
\mathtt
\muskip
\nocite
\nocorr
\number
\obeycr
\openin
\output
\parbox
\parsep
\pounds
\secdef
\setbox
\sfcode
\sloppy
\string
\symbol
\textbf
\textit
\textmd
\textrm
\textsc
\textsf
\textsl
\texttt
\textup
\thanks
\topsep
\typein
\uccode
\uchyph
\unhbox
\unkern
\unskip
\unvbox
\usebox
\valign
\vector
\vspace
\vsplit
\Diamond
\advance
\badness
\bibcite
\bibdata
\bibitem
\bibname
\bigskip
\botmark
\caption
\catcode
\chapter
\chardef
\closein
\dashbox
\delcode
\dotfill
\endgraf
\endlist
\endmath
\errhelp
\everycr
\fboxsep
\footins
\headsep
\hfilneg
\hoffset
\iffalse
\ifhmode
\ifinner
\ifmmode
\ifvmode
\include
\itemize
\itemsep
\iterate
\itshape
\jobname
\lastbox
\leaders
\leadsto
\lefteqn
\makebox
\mathbin
\mathcal
\mathord
\mathrel
\meaning
\medskip
\message
\newfont
\newhelp
\newline
\newpage
\noalign
\nofiles
\openout
\pageref
\parskip
\pausing
\penalty
\picture
\poptabs
\protect
\qbezier
\radical
\refname
\rootbox
\savebox
\scshape
\section
\shipout
\showbox
\showthe
\skipdef
\slshape
\special
\stretch
\subitem
\tabbing
\tabskip
\tabular
\textbar
\thempfn
\thepage
\thepart
\toksdef
\topmark
\topskip
\typeout
\unhcopy
\unvcopy
\upshape
\usefont
\vadjust
\vcenter
\vfilneg
\voffset
\appendix
\bfseries
\bibstyle
\boldmath
\citation
\cleaders
\closeout
\countdef
\dimendef
\document
\endarray
\endgroup
\endinput
\endlrbox
\eqnarray
\equation
\everyjob
\everypar
\fboxrule
\floatsep
\fnsymbol
\fontname
\fontsize
\footnote
\footskip
\framebox
\glossary
\hbadness
\labelsep
\language
\lastkern
\lastskip
\leftmark
\leftskip
\lineskip
\markboth
\mathchar
\mathcode
\mathopen
\maxdepth
\mdseries
\minipage
\moveleft
\multiply
\multiput
\newcount
\newdimen
\newlabel
\noexpand
\noindent
\nolimits
\nonumber
\nullfont
\overline
\pagegoal
\pagename
\parshape
\partname
\patterns
\prevgraf
\proclaim
\pushtabs
\raisebox
\rmfamily
\samepage
\sffamily
\skewchar
\sqsubset
\sqsupset
\stackrel
\textfont
\textless
\theenumi
\thehours
\thetable
\trivlist
\ttfamily
\underbar
\vbadness
\verbatim
\vpageref
\vphantom
\xleaders
\Leftarrow
\LoadClass
\addvspace
\backslash
\batchmode
\bibindent
\centering
\clearpage
\columnsep
\delimiter
\endcenter
\endcsname
\enumerate
\everyhbox
\everymath
\everyvbox
\firstmark
\flushleft
\fontdimen
\fontshape
\futurelet
\hangafter
\hlinefill
\hrulefill
\immediate
\indexname
\intextsep
\leftarrow
\linebreak
\linewidth
\listfiles
\looseness
\lowercase
\makeindex
\makelabel
\maketitle
\marginpar
\markright
\mathclose
\mathgroup
\mathinner
\mathpunct
\medmuskip
\moveright
\muskipdef
\newlength
\nonscript
\onecolumn
\pagebreak
\pagedepth
\pagestyle
\pagetotal
\paragraph
\parindent
\partopsep
\prevdepth
\quotation
\restorecr
\rightmark
\rightskip
\setlength
\showlists
\sloppypar
\smallskip
\spaceskip
\tabcolsep
\tablename
\textstyle
\textwidth
\theenumiv
\thefigure
\thinlines
\tolerance
\topmargin
\twocolumn
\underline
\unpenalty
\uppercase
\Rightarrow
\addpenalty
\aftergroup
\begingroup
\botfigrule
\dblfigrule
\deadcycles
\enditemize
\endpicture
\endtabbing
\endtabular
\ensuremath
\errmessage
\escapechar
\figurename
\flushright
\fontfamily
\fontseries
\globaldefs
\hangindent
\headheight
\hyphenchar
\indexspace
\itemindent
\labelenumi
\labelitemi
\labelwidth
\leavevmode
\leftmargin
\linespread
\mathaccent
\mathchoice
\mathnormal
\newcommand
\newcounter
\newsavebox
\newtheorem
\noboundary
\nocorrlist
\normalfont
\normalsize
\numberline
\pageshrink
\paperwidth
\qbeziermax
\raggedleft
\relpenalty
\rightarrow
\scriptfont
\scriptsize
\scrollmode
\selectfont
\setcounter
\settodepth
\settowidth
\shortstack
\showoutput
\subsection
\subsubitem
\tabbingsep
\textbullet
\textdagger
\textdollar
\textemdash
\textendash
\textheight
\textnormal
\thechapter
\theminutes
\thesection
\thicklines
\thinmuskip
\topfigrule
\unboldmath
\unitlength
\usecounter
\usepackage
\varepsilon
\xspaceskip
\PackageInfo
\addtolength
\adjdemerits
\arraycolsep
\boxmaxdepth
\chaptermark
\chaptername
\clubpenalty
\columnwidth
\dblfloatsep
\deffootnote
\displaymath
\enddocument
\endeqnarray
\endequation
\endlinechar
\endminipage
\endtrivlist
\endverbatim
\expandafter
\extracolsep
\flushbottom
\fontsubfuzz
\footnotesep
\hyphenation
\includeonly
\inputlineno
\labelitemii
\labelitemiv
\lastpenalty
\leftmargini
\leftmarginv
\linepenalty
\makeatother
\mathchardef
\mathversion
\multicolumn
\newlinechar
\nolinebreak
\nonstopmode
\nopagebreak
\normalcolor
\nouppercase
\pagestretch
\paperheight
\parfillskip
\raggedright
\rightmargin
\scriptfont0
\scriptspace
\scriptstyle
\sectionmark
\setlanguage
\settoheight
\spacefactor
\stepcounter
\textcircled
\textgreater
\textsection
\theequation
\thefootnote
\thickmuskip
\topfraction
\totalheight
\CheckCommand
\IfFileExists
\MessageBreak
\PackageError
\ProvidesFile
\abstractname
\addtocounter
\addtoversion
\appendixname
\arraystretch
\baselineskip
\bibliography
\binoppenalty
\contentsline
\contentsname
\displaystyle
\displaywidth
\endenumerate
\endflushleft
\endquotation
\endsloppypar
\endtitlepage
\everydisplay
\filecontents
\fontencoding
\footnotemark
\footnoterule
\footnotesize
\footnotetext
\ignorespaces
\labelitemiii
\leftmarginii
\leftmarginiv
\leftmarginvi
\listoftables
\makeatletter
\makeglossary
\marginparsep
\mathellipsis
\mathsterling
\mathsurround
\negthinspace
\nobreakspace
\oldstylenums
\overfullrule
\pretolerance
\raggedbottom
\renewcommand
\romannumeral
\setpapersize
\showboxdepth
\showoverfull
\splitbotmark
\splittopskip
\subparagraph
\textellipsis
\textfloatsep
\textfraction
\textsterling
\theparagraph
\tracingfonts
\tracingpages
\tracingstats
\widowpenalty
\CurrentOption
\DeclareOption
\MakeLowercase
\MakeUppercase
\OptionNotUsed
\ProvidesClass
\addtocontents
\brokenpenalty
\columnseprule
\discretionary
\displayindent
\displaylimits
\documentclass
\documentstyle
\doublerulesep
\endflushright
\errorstopmode
\frenchspacing
\hyphenpenalty
\lefthyphenmin
\leftmarginiii
\linethickness
\listoffigures
\listparindent
\listtablename
\marginparpush
\maxdeadcycles
\medskipamount
\oddsidemargin
\outputpenalty
\pagenumbering
\paragraphmark
\splitmaxdepth
\subsubsection
\textbackslash
\textbraceleft
\textdaggerdbl
\textparagraph
\textquoteleft
\texttrademark
\thempfootnote
\thesubsection
\thispagestyle
\tracingmacros
\tracingonline
\tracingoutput
\AtEndOfPackage
\ExecuteOptions
\Leftrightarrow
\NeedsTeXFormat
\PackageWarning
\ProcessOptions
\RequirePackage
\arrayrulewidth
\atopwithdelims
\bottomfraction
\dbltopfraction
\enddisplaymath
\evensidemargin
\holdinginserts
\leftrightarrow
\lineskiplimits
\listfigurename
\marginparwidth
\mathunderscore
\newenvironment
\overwithdelims
\pagefilstretch
\predisplaysize
\refstepcounter
\righthyphenmin
\showboxbreadth
\splitfirstmark
\subsectionmark
\suppressfloats
\tabularnewline
\textasciitilde
\textbraceright
\textexclamdown
\textquoteright
\textregistered
\textunderscore
\AtBeginDocument
\ProvidesPackage
\abovewithdelims
\addcontentsline
\afterassignment
\baselinestretch
\cleardoublepage
\dbltextfloatsep
\defaultskewchar
\delimiterfactor
\enlargethispage
\exhyphenpenalty
\floatingpenalty
\includegraphics
\insertpenalties
\newmathalphabet
\normalmarginpar
\pagefillstretch
\tableofcontents
\textasciicircum
\textsuperscript
\thefootnotemark
\thesubparagraph
\tracingcommands
\tracingrestores
\abovecaptionskip
\abovedisplayskip
\belowcaptionskip
\belowdisplayskip
\descriptionlabel
\emergencystretch
\interlinepenalty
\nonfrenchspacing
\pagefilllstretch
\renewenvironment
\reversemarginpar
\scriptscriptfont
\subparagraphmark
\textcompwordmark
\textquestiondown
\textquotedblleft
\textvisiblespace
\thesubsubsection
\tracinglostchars
\InputIfFileExists
\bibliographystyle
\defaulthyphenchar
\errorcontextlines
\floatpagefraction
\predisplaypenalty
\scriptscriptstyle
\subsubsectionmark
\textquotedblright
\tracingparagraphs
\PassOptionsToClass
\defaultscriptratio
\delimitershortfall
\finalhypendemerits
\nulldelimiterspace
\postdisplaypenalty
\textperiodcentered
\displaywidowpenalty
\DeclareRobustCommand
\PackageWarningNoLine
\PassOptionsToPackage
\dblfloatpagefraction
\doublehyphendemerits
\textasteriskcentered
\DeclareOldFontCommand
\abovedisplayshortskip
\belowdisplayshortskip
\defaultscriptscriptratio
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+217343
View File
File diff suppressed because it is too large Load Diff