#!/usr/bin/perl -w
#-----------------------------------------------------------+
#                                                           |
# HardMask.pl                                               | 
#                                                           |
#-----------------------------------------------------------+
#  AUTHOR: James C. Estill                                  |
# CONTACT: JamesEstill_at_gmail.com                         |
# STARTED: 04/17/2007                                       |
# UPDATED: 04/17/2007                                       |
#                                                           |
# DESCRIPTION:                                              |
#  Given a soft masked fasta file, replace all lowercase    |
#  characters with a mask character. The character that is  |
#  used for masking is a variable the user can select.      |
#                                                           |
# USAGE:                                                    |
#  HardMask.pl -i InFile.fasta -o OutFile.fasta -m X        |
#   -i Path to the input fasta file                         |
#   -o Path to the output fasta file                        |
#      Default is InFile.hard.masked                        |
#   -m character to mask with, default is 'N'               |
#                                                           |
#-----------------------------------------------------------+

#-----------------------------+
# INCLUDES                    |
#-----------------------------+
use strict;                    # Reject undeclared variables and symbolic refs
use warnings;                  # Lexical warnings, in addition to -w above
use Getopt::Std;               # Allows to get options from the command line

# Progress message; printed before the command line is examined
print "The HardMask.pl program has started\n";

#-----------------------------+
# VARIABLE SCOPE              |
#-----------------------------+
my $InFile;                    # Full path to input file
my $OutFile;                   # Full path to output file
my $MaskChar;                  # Character to mask with
my $PrintHelp;                 # Boolean to print the help message
my $NumRecs = 0;               # Number of Records processed

# Usage text, shown for -h and when the required -i option is missing
my $Usage = "\nHardMask -i InFilePath.fasta -o OutFile.fasta -m N\n".
    " -i Path to the input fasta file\n".
    " -o Path to the output fasta file\n".
    "    Default is InFile.hard.masked\n".
    " -m Character to mask with\n".
    "    Acceptable characters [N,n,X,x]\n".
    "    Default is 'N'\n\n";

#-----------------------------+
# COMMAND LINE OPTIONS        |
#-----------------------------+
my %Options;
getopts('i:o:m:h', \%Options);

# Handle the help request before any required option is enforced, so that
# 'HardMask.pl -h' prints the usage and exits normally instead of dying
# because -i is missing.
$PrintHelp = $Options{h};

if ($PrintHelp)
{
    print $Usage;
    exit;
}

# -i is required. Test for defined/non-empty rather than truth so that a
# file literally named "0" is still accepted.
$InFile = $Options{i};
die $Usage
    unless (defined $InFile && length $InFile);

# -o is optional; the default is the input path plus a fixed suffix.
$OutFile = (defined $Options{o} && length $Options{o})
    ? $Options{o}
    : $InFile.".hard.masked";

# -m is optional; the default mask character is 'N'.
$MaskChar = (defined $Options{m} && length $Options{m})
    ? $Options{m}
    : "N";

# Reject an unsupported mask character here, before any file is opened,
# so a bad -m value can not leave a truncated output file behind.
unless ($MaskChar =~ m/\A[NnXx]\z/)
{
    die "\aERROR: A valid mask character was not selected\n$Usage\n";
}



print "Masking:".$InFile."\n";

#-----------------------------+
# OPEN FILES                  |
#-----------------------------+
# Opening the output for writing truncates it, so writing onto the input
# path would destroy the sequence data before it is read. Only identical
# path strings are detected here.
if ($InFile eq $OutFile)
{
    die "Input and output file can not be the same:\n$InFile\n";
}

# Three-argument open keeps the mode separate from the path, so a path
# that begins with '>' or ends with '|' is treated as a plain file name.
# $! is appended so the reason for a failure is reported.
open (IN, "<", $InFile)
    or die "Can not open input file:\n$InFile\n$!\n";

open (OUT, ">", $OutFile)
    or die "Can not open output file:\n$OutFile\n$!\n";

#-----------------------------+
# HARD MASK ALTERNATIVE       |
#-----------------------------+
# tr/// builds its translation table at compile time and does not
# interpolate variables, so each accepted mask character gets its own
# literal tr/// below.
#
# Note that tr/// takes a plain character list, not a regular expression:
# the range is written a-z, because in [a-z] the brackets would be
# literal characters and would be masked as well.

# Validate the mask character once, before any line is read, so that an
# invalid value is reported even for an empty or header-only input.
my %IsValidMask = map { $_ => 1 } qw(N n X x);
unless ($IsValidMask{$MaskChar})
{
    die "\aERROR: A valid mask character was not selected\n$Usage\n";
}

while (my $Line = <IN>)
{
    # Header lines are copied unchanged and counted as records
    if ($Line =~ m/^>/)
    {
        print OUT $Line;
        $NumRecs++;
        next;
    }

    # Sequence line: replace every lowercase letter with the mask
    # character. Exact string comparison is used so that only the four
    # single-character values are recognized.
    if ($MaskChar eq "N")
    {
        $Line =~ tr/a-z/N/;
    }
    elsif ($MaskChar eq "X")
    {
        $Line =~ tr/a-z/X/;
    }
    elsif ($MaskChar eq "x")
    {
        $Line =~ tr/a-z/x/;
    }
    else
    {
        # Only "n" remains after the validation above
        $Line =~ tr/a-z/n/;
    }

    # Print masked string to the outfile
    print OUT $Line;

} # End of while IN



#-----------------------------+
# CLOSE FILES AND EXIT        |
#-----------------------------+
close IN;

# Output is buffered, so a write failure (for example a full disk) may
# only surface when the handle is closed; report it instead of claiming
# success.
close OUT
    or die "Can not close output file:\n$OutFile\n$!\n";

print "The HardMask Program has finished\n";
print "$NumRecs sequence records were processed\n";
exit;

#-----------------------------------------------------------+
# HISTORY                                                   |
#-----------------------------------------------------------+
# 
# 04/17/2007
# - Program started, gets input and output path from command
#   line and uses regular expression to change lowercase
#   letters to the masked character.
# - The character to mask with is a variable, but it must be
#   one from a valid list of possible masking variables.   

