# R.Muralikrishnan, MPI for Empirical Aesthetics. This work is licensed under a
# Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Licence.
#!/usr/bin/awk
#####################################################
# Awk script to generate phoneme frequency counts
#
# Input:  Multiple text files, all at once!
#         Each file contains the phonemes of one poem on a single line.
#         Words are separated by tabs; the phonemes within a word are
#         separated by spaces.
#
# This version splits all diphthongs and affricate consonants into their
# individual parts and counts them separately (for instance, dʒ will be split,
# to count d with d, ʒ with ʒ, and not dʒ as such).
#
# Output: Individual phoneme counts/word/poem and
#         total number of phonemes/word/poem
#
# Execute as:
#   awk -f Count-Phoneme-Frequency-Split-Phonemes--Per-Word.awk *.txt >> Output.out
# (awk must be GNU awk: the script relies on BEGINFILE and on the |& coprocess
# operator.)
#
# Notice that we process all files in one go from two different folders.
# This ensures that the phoneme list contains all the phonemes occurring in
# all the poems in the two folders. So if a particular phoneme is not present
# in any of the first set of poems but does occur in one of the other set, it
# will still be listed for the first set, with a 0 count. This is achieved by
# printing the output at the very last, after treating all the files from both
# folders all at once.
#
# Author: R.Muralikrishnan
#####################################################

# Record one occurrence of Phoneme in word number Word of poem number Poem.
# Comma subscripts (joined by SUBSEP) keep the key parts apart; plain string
# concatenation would make e.g. poem 1 / word 11 collide with poem 11 / word 1.
function CountPhoneme(Poem, Word, Phoneme)
{
    PhonCountPerPoemPerWord[Poem, Word, Phoneme]++;
    # Ultimately, the indices of PhonCountAcrossPoems form the list of all
    # the phonemes that occurred across all the poems.
    PhonCountAcrossPoems[Phoneme]++;
    TotalPhonCountPerPoemPerWord[Poem, Word]++;
}

BEGIN {
    # Read tab as the record separator (instead of the default \n), so each
    # tab-separated word is treated as a separate 'row'.
    ### RESET THIS before using the two-way I/O facility to run shell commands
    ### (see the END rule); otherwise the output of the coprocess is not read
    ### back line by line.
    RS = "\t";
    FS = " ";
}

# Treat all the files supplied all at once, one after the other.
# This rule applies at the beginning of each input file. Needs GAWK.
BEGINFILE {
    PoemCounter++;
    # The poem name is the file name minus its final extension. Only the last
    # ".ext" is stripped, so dots elsewhere in the path (e.g. "./dir/poem.txt")
    # do not truncate the name.
    CurrentName = FILENAME;
    sub("\\.[^./]*$", "", CurrentName);
    PoemName[PoemCounter] = CurrentName;
}

# Since RS = \t and FS = default (space), in effect, each word is a row and
# each phoneme within a word is a field. So we get here once for every word
# in every poem.
{
    WordsPerPoem[PoemCounter]++;
    WordCounter = FNR;

    # In the input file, wherever /ɐ/ occurs, it appears as forming a diphthong
    # with its preceding vowel. But such pairs should be treated as two
    # individual vowels rather than diphthongs. So insert a space before /ɐ/
    # throughout the current word; modifying $0 re-splits the fields.
    gsub(/ɐ/, " ɐ");

    for (i = 1; i <= NF; i++) {
        # Split-phoneme version: split all phoneme combinations (diphthongs
        # and affricate consonants), but take care not to split long vowels
        # such as /iː/, which are recognised by the presence of /ː/.
        if (length($i) > 1 && index($i, "ː") == 0) {
            # Splitting by "" (not " ") breaks the string up char by char;
            # split returns the number of array elements.
            N_of_Elements = split($i, Parts, "");
            for (jj = 1; jj <= N_of_Elements; jj++) {
                CountPhoneme(PoemCounter, WordCounter, Parts[jj]);
            }
        } else {
            CountPhoneme(PoemCounter, WordCounter, $i);
        }
    }
}

# At the very end, after going through all the poems, generate a uniform list
# of phonemes and their counts (0 if a given phoneme didn't occur) for each
# word in each poem.
END {
    ### Reset the record separator to the default value!
    ### Very crucial, because the output of the sort coprocess below is read
    ### back with getline, one line per phoneme.
    RS = "\n";

    # LC_ALL=C ensures the traditional Unix ASCII sorting order.
    # Ref: https://www.gnu.org/software/gawk/manual/gawk.html#Two_002dway-I_002fO
    Command = "LC_ALL=C sort";

    ### Two-way I/O begin
    # Sort the grand list (i.e., across all poems) of phonemes. We use the
    # two-way I/O pipe because the phonemes must be sent to the shell's sort
    # and its output is needed back here inside the script.
    # The list is identical for every word, so it is sorted once and cached.
    N_of_Phonemes = 0;
    HavePhonemes = 0;
    for (Phoneme in PhonCountAcrossPoems) {
        print Phoneme |& Command;
        HavePhonemes = 1;
    }
    # Only talk to sort if something was written to it: reading from a
    # coprocess that was never fed would start a fresh sort waiting for input
    # while we wait for its output, i.e. a never-ending process.
    if (HavePhonemes) {
        # Close the write end of the pipe: this is the EOF indication that
        # lets sort do its job on the whole input. The read end stays open.
        close(Command, "to");
        while ((Command |& getline SortedPhoneme) > 0) {
            SortedPhonemes[++N_of_Phonemes] = SortedPhoneme;
        }
        # Close the remaining (read) end of the pipe.
        close(Command);
    }
    ### Two-way I/O end

    # One output line per word of each poem.
    for (CurrentPoem = 1; CurrentPoem <= PoemCounter; CurrentPoem++) {
        for (CurrentWord = 1; CurrentWord <= WordsPerPoem[CurrentPoem]; CurrentWord++) {
            # Constant format strings: file names and phonemes are data and
            # must never be interpreted as printf formats.
            printf "%s Word %s TotalPhonemesinWord %s ", PoemName[CurrentPoem], CurrentWord, TotalPhonCountPerPoemPerWord[CurrentPoem, CurrentWord];

            # For each phoneme of the sorted list, print the phoneme itself
            # and its count in the current word, or 0 if it didn't occur.
            for (k = 1; k <= N_of_Phonemes; k++) {
                SortedPhoneme = SortedPhonemes[k];
                if ((CurrentPoem, CurrentWord, SortedPhoneme) in PhonCountPerPoemPerWord) {
                    printf "%s %s ", SortedPhoneme, PhonCountPerPoemPerWord[CurrentPoem, CurrentWord, SortedPhoneme];
                } else {
                    printf "%s 0 ", SortedPhoneme;
                }
            }

            # End the line corresponding to the current word.
            printf "\n";
        }
    }
}