mirror of
https://github.com/wassname/pysle.git
synced 2026-06-27 16:10:05 +08:00
Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 4cc4bf85ec | |||
| 4c1a26ed03 | |||
| ac8643678b | |||
| b76454f626 | |||
| ea0bc5c5cd | |||
| 5d70367bfc | |||
| 81257bdfaf | |||
| 88f79d63e8 | |||
| 2dcb92217d |
+13
-3
@@ -39,10 +39,20 @@ What can you do with this library?
|
||||
|
||||
pysle.syllabifyTextgrid(isleDict, praatioTextgrid, "words", "phones")
|
||||
|
||||
- search for words based on pronunciation::
|
||||
|
||||
e.g. words that start with a given sound, contain a sound word-medially, or
|
||||
in stressed vowel position, etc.
|
||||
|
||||
see /tests/dictionary_search.py
|
||||
|
||||
Major revisions
|
||||
================
|
||||
|
||||
Ver 1.4 (July 9, 2016)
|
||||
|
||||
- added search functionality
|
||||
|
||||
Ver 1.3 (March 15, 2016)
|
||||
|
||||
- added indices for stressed vowels
|
||||
@@ -139,7 +149,7 @@ Acknowledgements
|
||||
|
||||
Development of Pysle was possible thanks to NSF grant **IIS 07-03624**
|
||||
to Jennifer Cole and Mark Hasegawa-Johnson, NSF grant **BCS 12-51343**
|
||||
to Jennifer Cole, José Hualde, and Caroline Smith, and
|
||||
to the A*MIDEX project (n° **ANR-11-IDEX-0001-02**) to James Sneed German
|
||||
funded by the Investissements d’Avenir French Government program, managed
|
||||
to Jennifer Cole, José Hualde, and Caroline Smith, and
|
||||
to the A*MIDEX project (n° **ANR-11-IDEX-0001-02**) to James Sneed German
|
||||
funded by the Investissements d'Avenir French Government program, managed
|
||||
by the French National Research Agency (ANR).
|
||||
|
||||
+136
-1
@@ -4,10 +4,27 @@ Created on Oct 11, 2012
|
||||
@author: timmahrt
|
||||
'''
|
||||
|
||||
import io
|
||||
import re
|
||||
|
||||
charList = ['#', '&', '&r', '3r', '9r', '>', '>i', '@', 'A', 'D', 'E',
|
||||
'I', 'N', 'S', 'T', 'U', 'Z', '^', 'a', 'aI', 'aU', 'b',
|
||||
'd', 'dZ', 'd_(', 'e', 'ei', 'f', 'g', 'h', 'i', 'i:',
|
||||
'j', 'k', 'kh', 'l', 'l=', 'm', 'n', 'n=', 'oU', 'p',
|
||||
'ph', 'r', 's', 'sh', 't', 'tS', 't_(', 'th', 'u',
|
||||
'v', 'w', 'y', 'z']
|
||||
|
||||
vowelList = ['a', '@', 'e', 'i', 'o', 'u', '^', '&', '>', ]
|
||||
|
||||
|
||||
def isVowel(char):
    '''
    Returns True if any entry of vowelList occurs inside char
    '''
    for candidate in vowelList:
        if candidate in char:
            return True
    return False
|
||||
|
||||
|
||||
def sequenceMatch(matchChar, searchStr):
    '''
    Returns True if matchChar is contained in searchStr
    '''
    isPresent = matchChar in searchStr
    return isPresent
|
||||
|
||||
|
||||
class WordNotInISLE(Exception):
|
||||
|
||||
def __init__(self, word):
|
||||
@@ -30,7 +47,8 @@ class LexicalTool():
|
||||
Builds the isle textfile into a dictionary for fast searching
|
||||
'''
|
||||
lexDict = {}
|
||||
wordList = [line.rstrip('\n') for line in open(self.islePath, "rU")]
|
||||
with io.open(self.islePath, "r", encoding='utf-8') as fd:
|
||||
wordList = [line.rstrip('\n') for line in fd]
|
||||
|
||||
for row in wordList:
|
||||
word, pronunciation = row.split(" ", 1)
|
||||
@@ -60,6 +78,123 @@ class LexicalTool():
|
||||
|
||||
return pronList
|
||||
|
||||
def search(self, matchStr, numSyllables=None, wordInitial='ok',
           wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
           multiword='ok'):
    '''
    Runs the module-level search() over the entries of this dictionary

    All arguments are forwarded unchanged; see the module-level
    search() for their meaning.
    '''
    entries = self.data.items()
    options = {'numSyllables': numSyllables,
               'wordInitial': wordInitial,
               'wordFinal': wordFinal,
               'spanSyllable': spanSyllable,
               'stressedSyllable': stressedSyllable,
               'multiword': multiword}
    return search(entries, matchStr, **options)
|
||||
|
||||
|
||||
def search(searchList, matchStr, numSyllables=None, wordInitial='ok',
           wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
           multiword='ok'):
    '''
    Searches for matching words in the dictionary with regular expressions

    searchList is an iterable of (word, pronunciationList) pairs.  Each
    pronunciation is a string in which '#' marks word boundaries, '.'
    marks syllable boundaries and "'" marks stress; spaces and commas
    are stripped before matching.

    numSyllables, if not None, keeps only pronunciations with exactly
    that many syllables (number of '.' plus one).

    wordInitial, wordFinal, spanSyllable, stressedSyllable, and multiword
    can take three different values: 'ok', 'only', or 'no'.

    Special search characters:
    'V' - any vowel
    'R' - any rhotic
    '#' - word boundary
    'B' - syllable boundary
    '.' - anything

    Regular expression syntax applies to matchStr.  For example,
    matchStr='VR' with wordFinal='only' looks for a vowel followed by a
    rhotic at the end of a word.

    NOTE(review): optional stress/syllable markers are inserted between
    *every* character of matchStr, regex metacharacters included, so
    patterns that contain groups such as '(?:...)' may not behave as
    intended -- confirm before relying on them.

    Returns a sorted list of (word, pronunciationList) tuples holding
    only the pronunciations that matched.
    '''

    # Optional markers that may appear between the characters of the
    # search string.  A marker is only interleaved when the matching
    # option allows it ('ok' or 'only').
    interleaveStr = None
    stressOpt = stressedSyllable in ('ok', 'only')
    spanOpt = spanSyllable in ('ok', 'only')
    if stressOpt and spanOpt:
        interleaveStr = r"\.?'?"
    elif stressOpt:
        interleaveStr = r"'?"
    elif spanOpt:
        interleaveStr = r"\.?"

    if interleaveStr is not None:
        matchStr = interleaveStr.join(matchStr)

    # Setting search boundaries
    # We search on '[^\.#]' and not '.' so that the search doesn't span
    # multiple syllables or words
    if wordInitial == 'only':
        matchStr = '#' + matchStr
    elif wordInitial == 'no':
        # Match the closest preceding syllable.  If there is none, look
        # for word boundary plus at least one other character
        matchStr = r'(?:\.[^\.#]*?|#[^\.#]+?)' + matchStr
    else:
        matchStr = r'[#\.][^\.#]*?' + matchStr

    if wordFinal == 'only':
        matchStr = matchStr + '#'
    elif wordFinal == 'no':
        matchStr = matchStr + r"(?:[^\.#]*?\.|[^\.#]+?#)"
    else:
        matchStr = matchStr + r'[^\.#]*?[#\.]'

    # Replace special characters.  None of the replacement strings
    # contains 'V', 'R' or 'B', so the replacement order does not matter.
    replDict = {"V": "(?:aI|aU|ei|oU|[AEIaeiu]):?",
                "R": "[&39]?r",
                "B": r"\."}

    for char, replStr in replDict.items():
        matchStr = matchStr.replace(char, replStr)

    # Run search for words
    compiledRE = re.compile(matchStr)
    retList = []
    for word, pronList in searchList:
        newPronList = []
        for pron in pronList:
            searchPron = pron.replace(",", "").replace(" ", "")

            if numSyllables is not None:
                if numSyllables != searchPron.count('.') + 1:
                    continue

            # Is this a compound word?  A single word has exactly two
            # word-boundary markers (one on each side).
            if multiword == 'only':
                if searchPron.count('#') == 2:
                    continue
            elif multiword == 'no':
                if searchPron.count('#') > 2:
                    continue

            matchList = compiledRE.findall(searchPron)
            if not matchList:
                continue

            # Keep the pronunciation if at least one match satisfies
            # the stress requirement.
            if stressedSyllable == 'only':
                if not any("'" in match for match in matchList):
                    continue
            if stressedSyllable == 'no':
                if all("'" in match for match in matchList):
                    continue

            # For syllable spanning, we check if there is a syllable
            # marker inside (not at the border) of the match.
            if spanSyllable == 'only':
                if not any("." in txt[1:-1] for txt in matchList):
                    continue
            if spanSyllable == 'no':
                if all("." in txt[1:-1] for txt in matchList):
                    continue

            newPronList.append(pron)

        if newPronList:
            retList.append((word, newPronList))

    retList.sort()
    return retList
|
||||
|
||||
|
||||
def _parsePronunciation(pronunciationStr):
|
||||
'''
|
||||
|
||||
@@ -1,16 +1,19 @@
|
||||
#!/usr/bin/env python
|
||||
# encoding: utf-8
|
||||
'''
|
||||
Created on Oct 15, 2014
|
||||
|
||||
@author: tmahrt
|
||||
'''
|
||||
import codecs
|
||||
from distutils.core import setup
|
||||
setup(name='pysle',
|
||||
version='1.3.0',
|
||||
version='1.4.0',
|
||||
author='Tim Mahrt',
|
||||
author_email='timmahrt@gmail.com',
|
||||
package_dir={'pysle':'pysle'},
|
||||
packages=['pysle'],
|
||||
license='LICENSE',
|
||||
long_description=open('README.rst', 'r').read(),
|
||||
long_description=codecs.open('README.rst', 'r', encoding="utf-8").read(),
|
||||
# install_requires=[], # No requirements! # requires 'from setuptools import setup'
|
||||
)
|
||||
|
||||
@@ -35,10 +35,12 @@ returnList = pronunciationtools.findBestSyllabification(isleDict,
|
||||
searchWord,
|
||||
anotherPhoneList)
|
||||
|
||||
stressedSyllable, syllableList, syllabification, stressedIndex = returnList
|
||||
|
||||
(stressedSyllable, syllableList, syllabification,
|
||||
stressedSyllableIndexList, stressedPhoneIndexList,
|
||||
flattenedStressIndexList) = returnList
|
||||
print(searchWord)
|
||||
print(anotherPhoneList)
|
||||
print(syllableList) # We can see the first syllable was elided
|
||||
|
||||
print(stressedSyllableIndexList) # We can see the first syllable was elided
|
||||
print(stressedPhoneIndexList)
|
||||
print(flattenedStressIndexList)
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
'''
|
||||
Created on July 08, 2016
|
||||
|
||||
@author: tmahrt
|
||||
|
||||
Basic examples of common usage.
|
||||
'''
|
||||
|
||||
import random
|
||||
|
||||
from pysle import isletool
|
||||
|
||||
tmpPath = r"C:\Users\Tim\Dropbox\workspace\pysle\test\islev2.txt"
|
||||
isleDict = isletool.LexicalTool(tmpPath)
|
||||
|
||||
def printOutMatches(matchStr, numSyllables=None, wordInitial='ok',
                    wordFinal='ok', spanSyllable='ok', stressedSyllable='ok',
                    multiword='ok', numMatches=None, matchList=None):
    '''
    Runs a search and prints the matching words with their pronunciations

    If matchList is None the whole isleDict is searched; otherwise the
    search is run over matchList (e.g. the result of an earlier call),
    which allows searches to be chained.

    If numMatches is not None, at most numMatches entries are printed.
    When there are more matches than that, the result list is shuffled
    in place first, so the printed entries are a random sample.

    Returns the full list of matches (shuffled if it was truncated for
    printing).
    '''

    if matchList is None:
        matchList = isleDict.search(matchStr, numSyllables, wordInitial,
                                    wordFinal, spanSyllable, stressedSyllable,
                                    multiword)
    else:
        matchList = isletool.search(matchList, matchStr, numSyllables,
                                    wordInitial, wordFinal, spanSyllable,
                                    stressedSyllable, multiword)

    if numMatches is not None and len(matchList) > numMatches:
        random.shuffle(matchList)

    for i, (word, pronList) in enumerate(matchList):
        # '>=' so that exactly numMatches entries are printed
        # (i is zero-based)
        if numMatches is not None and i >= numMatches:
            break
        print("%s: %s" % (word, repr(pronList)))
    print("")

    return matchList
|
||||
|
||||
# 2-syllable words with a stressed syllable containing 'dV' but not word initially
|
||||
printOutMatches("dV", stressedSyllable="only", spanSyllable="no",
|
||||
wordInitial="no", numSyllables=2, numMatches=10)
|
||||
|
||||
# 3-syllable word with an 'ld' sequence that spans a syllable boundary
|
||||
printOutMatches("lBd", wordInitial="no", multiword='no',
|
||||
numSyllables=3, numMatches=10)
|
||||
|
||||
# words ending in 'inth'
|
||||
matchList = printOutMatches("InT", wordFinal="only", numMatches=10)
|
||||
|
||||
# that also start with 's'
|
||||
matchList = printOutMatches("s", wordInitial="only", numMatches=10,
|
||||
matchList=matchList, multiword="no")
|
||||
Reference in New Issue
Block a user