# Creative Commons License: R.Muralikrishnan, MPI for Empirical Aesthetics. This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International Licence.
#!/usr/bin/awk
#####################################################
# Awk script to generate phoneme frequency counts
#
# Input: Multiple textfiles all at once!!!
#        Each file contains the phonemes of one poem
#        on a single line.
#
# Output: Individual phoneme counts/word/poem and
#         Total number of phonemes/word/poem
#
# Execute as (GNU awk is required, because the script uses BEGINFILE and the |& co-process operator):
#   gawk -f Count-Phoneme-Frequency--Per-Word.awk *.txt >> Output.out
#
# Notice that we process all files in one go from two different folders.
# This ensures that the phoneme list contains all the phonemes occurring in
# all the poems in the two folders.  So if a particular phoneme is not present in any
# of the first set of poems but does occur in one of the other set, it will still be listed
# for the first set, with a 0 count.  This is achieved by printing the output at the very end,
# after processing all the files from both folders in one go.
# 
# Author: R.Muralikrishnan
#####################################################


BEGIN {
    # Fields: the phonemes inside one word are separated by a single space.
    FS = " ";

    # Records: use a tab instead of the default newline as Record-Separator,
    # so that every tab-separated word is handled as a separate 'row'.
    #
    ### NOTE: RS must be RESET to "\n" before the Two-Way I/O facility is used
    ### to run shell commands. Otherwise the output coming back from the shell
    ### command is split in unexpected places (we saw \n line breaks in the
    ### output of the Two-Way I/O where we wanted none).
    RS = "\t";
}

# Treat all the files supplied all at once, one after the other.
BEGINFILE{ # This rule applies at the beginning of each input file. Needs GAWK.
    # Every input file is one poem; number the poems in the order they are read.
    PoemCounter = PoemCounter + 1;

    # The poem name is the file name without its final extension.
    # Only a dot inside the last path component is treated as the start of an
    # extension, and it must be preceded by a non-slash character. Splitting
    # at the FIRST dot of the whole path would turn "./a.txt" or "../x/a.txt"
    # into an empty name and "dir.v2/a.txt" into "dir".
    PoemName[PoemCounter] = FILENAME;
    if (match(FILENAME, /[^\/]\.[^.\/]*$/))
    {
        # RSTART points at the character just before the extension dot.
        PoemName[PoemCounter] = substr(FILENAME, 1, RSTART);
    }
}

# Main rule: executed once per record. Since RS is a tab and FS is a space
# (see the BEGIN rule), each record is one word and each field is one phoneme,
# so we get here once for every word of every poem.
{
    WordsPerPoem[PoemCounter] = WordsPerPoem[PoemCounter] + 1;

    # FNR restarts at 1 for every input file, so it is the position of the
    # current word within the current poem.
    WordCounter = FNR;

    # In the input file, wherever /ɐ/ occurs, it appears as forming a diphthong
    # with its preceding vowel. But such pairs should be treated as
    # two individual vowels rather than diphthongs.
    # So insert a space before /ɐ/ globally for the current record; awk then
    # re-splits the record into fields.
    gsub(/ɐ/, " ɐ");

    for (i = 1; i <= NF; i++)
    {
        # Comma subscripts join the key parts with SUBSEP. Plain concatenation
        # would make e.g. poem 1 / word 11 and poem 11 / word 1 share the key "111".
        PhonCountPerPoemPerWord[PoemCounter, WordCounter, $i]++;

        # Ultimately, the indices of PhonCountAcrossPoems form the list of all
        # the phonemes that occurred across all the poems.
        PhonCountAcrossPoems[$i]++;
    }

    # Number of phonemes in this word (0 for an empty record, e.g. the one
    # produced by a trailing tab before the final newline of a file).
    TotalPhonCountPerPoemPerWord[PoemCounter, WordCounter] = NF;
}

# At the very end, after going through all the poems, generate a uniform list of phonemes and their counts (0 if a given phoneme didn't occur in a given word) for each word in each poem.

END {
    RS = "\n";  ### Reset the Record-Separator to the default value!!!
                ### Very crucial, because the output of the shell command below
                ### is read back one line at a time through Two-Way I/O.
                ### See the BEGIN rule for a more elaborate comment on this.

    # LC_ALL=C ensures the traditional Unix ASCII (byte-wise) sorting order.
    # Ref: https://www.gnu.org/software/gawk/manual/gawk.html#Two_002dway-I_002fO
    Command = "LC_ALL=C sort";

    ### Two-Way I/O Begin
    #
    # Sort the grand list (i.e., across all poems) of phonemes ONCE and keep the
    # result in SortedPhonemes[1..NumPhonemes]. The list is identical for every
    # word, so there is no need to start a new sort process per output line.
    # Ref: http://www.gnu.org/software/gawk/manual/html_node/Two_002dway-I_002fO.html
    NumPhonemes = 0;
    NumSent = 0;
    for (Phoneme in PhonCountAcrossPoems)
    {
        print Phoneme |& Command;
        NumSent++;
    }

    # Only talk to sort if something was actually sent to it. If nothing was
    # written, the co-process has not been started yet; reading from it would
    # start sort with its input still open and both sides would wait forever.
    if (NumSent > 0)
    {
        # Close the write-end of the pipe: this is the EOF indication that lets
        # sort do its job on the whole input. Without it, both processes keep
        # waiting for each other.
        close(Command, "to");

        # The from-end of the pipe is still open: read the sorted phonemes back.
        while ((Command |& getline SortedPhoneme) > 0)
        {
            SortedPhonemes[++NumPhonemes] = SortedPhoneme;
        }

        # Close the remaining end of the pipe.
        close(Command);
    }
    #
    ### Two-Way I/O End

    # For each poem...
    for (CurrentPoem = 1; CurrentPoem <= PoemCounter; CurrentPoem++)
    {
        # ...and for each word of that poem, print one output line.
        for (CurrentWord = 1; CurrentWord <= WordsPerPoem[CurrentPoem]; CurrentWord++)
        {
            # Always use a fixed format string: a '%' in a file name or in a
            # phoneme must be printed literally, not interpreted by printf.
            printf "%s Word %d TotalPhonemesinWord %d ", PoemName[CurrentPoem], CurrentWord, TotalPhonCountPerPoemPerWord[CurrentPoem, CurrentWord];

            # For each phoneme from the sorted list, print the phoneme itself
            # and its count in the current word, or 0 if it didn't occur there.
            for (PhonemeIndex = 1; PhonemeIndex <= NumPhonemes; PhonemeIndex++)
            {
                SortedPhoneme = SortedPhonemes[PhonemeIndex];

                # 'in' tests membership without creating an empty array element
                # for every poem x word x phoneme combination.
                if ((CurrentPoem, CurrentWord, SortedPhoneme) in PhonCountPerPoemPerWord)
                {
                    printf "%s %d ", SortedPhoneme, PhonCountPerPoemPerWord[CurrentPoem, CurrentWord, SortedPhoneme];
                }
                else
                {
                    printf "%s 0 ", SortedPhoneme;
                }
            }

            # At the end of the line corresponding to each word, print a newline character.
            printf "\n";
        }
    }
}