#!/usr/bin/perl -w use strict; use Getopt::Long; # Variables my ($mapfile, $bedfile, $help, $motifs, $LDdir, $FASTAdir, $HapMap, $userr2, $combo, $regionScore, $logfile, $outfile, $typeOfRun)=''; # command line options my %InMap = (); # snps found in the user provided MAP file my %InRegion = (); # snps located in genomic region of interest my %InMotif = (); # snps located within a motif my %candidateList = (); # snps that are selected based on user input my %LDList = (); # list and info on markers for which a proxy should be found my %finalList = (); # final list of genotyped and LD-based proxy snps my %chromosome = (); # save info on chromosomes so no LD files have to be searched in vein my %LDchromosome = (); # search only these chromosomes for LD markers my %AllHapMart = (); # Hash with physical position (chromosome and basepair) for each rs-number my %NumberOfSNPsInMotif = (); # This hash is used to count how many motifs have 1,2,3, etc. SNPs. my %PositionOfSNPInMotif = (); # This hash remembers how often SNPs have been seen in a given position. my $totalNumberOfMotifs = 0; # This variable keeps track of the total number of motifs found in the genome. my %ProblemSNPs = (); # hash to save potential problem SNPs that occur in motifs with more than one SNP. my @potentialProblemSNPs; # array of SNPs that occur in motifs with more than 1 polymorphism $typeOfRun = 'long'; # sets the default for the run to long # # Message about ReMo-SNPs and how to use it # sub usage() { my $errmsg = qq{ This program searches for polymorphic markers ('SNPs') in specified genomic regions and/or motivs. User input specifies whether a marker of interest needs to occur A) in the motiv AND the genomic region, B) in the motiv OR the genomic region, C) in both region and motiv plus in those regions/motivs that exceed a user defined score threshold. The program allows users to specify a score for each genomic region on the command line, to set thresholds for these scores. Above the threshold, a region will be included regardless of whether the motiv is present within the region or not. The score option thus provides the user with a flexible option to include regions with strong experimental support whether or not the region and the motiv co-occur. A list or markers in the relevant motifs or genomic regions is provided. Furthermore, the user can specify an r2 threshold to look for proxy markers (e.g., in an LD block; one SNP may disrupt a transcription factor binding site but another SNP happens to be present on the genotyping platform). usage: $0 [--help ] --map filename --bed filename --Motifs filename --r2 [0-1] --combo [AND/OR/SCORE] --regionScore [0-oo] --LDdir [directory path and name] --FASTAdir [directory path and name] --HapMap filename --log filename --help or --h or --? : this (help) message --map : Filename for PLINK format Mapfile with list of post-QC markers --bed : Filename for .bed format file containing interesting genomic regions and a score for each region (assigned default: 1) --Motifs : Text file with a Motif --r2 : Threshold for inclusion of proxy markers --combo : AND, OR or SCORE. AND selects snps in interesting regions that are located within a motiv. OR selects all regions and motivs. SCORE works like AND but additionally allows in markers that are in regions or motivs with a score above a certain threshold (see 'regionScore' below) --regionScore : threshold for region score above which to include snps in a region without being located in a motif; --LDdir : Directory where the HapMap LD files are located (do NOT unzip!!!) --FASTAdir : Directory where the IUPAC code-masked FASTA files are located --HapMap : Filename of HapMap file with description of known SNPs --typeOfRun : select 'short' for descriptive statistics on the motifs only and 'medium' for descriptive statistics including information on which SNPs occur in motifs with more than one SNP. The default value is 'long' for a full run. --log : Filename for logfile (default: ReMo.log) --out : Filename for outfile (default: ReMo.out) }; print STDERR "$errmsg\n"; exit; } GetOptions('map=s' => \$mapfile, 'bed=s' => \$bedfile, 'help|?' => \$help, 'Motifs=s' => \$motifs, 'LDdir=s' => \$LDdir, 'FASTAdir=s' => \$FASTAdir, 'HapMap=s' => \$HapMap, 'r2=f' => \$userr2, 'combo=s' => \$combo, 'regionScore=f' => \$regionScore, 'log=s' => \$logfile, 'out=s' => \$outfile, 'typeOfRun=s' => \$typeOfRun) or usage(); usage() if $help; # Open log file for feedback unless($logfile){$logfile = 'ReMo.SNPs.log';} open (LOG, ">$logfile") or die "\nCannot write logfile!\n"; print LOG "This is $0\nAnalysis started with the following arguments:\n"; # Go through the HapMap-file and make a hash with physical position (chromosome and base pair) for each rs-number. This information will be used later on. open (HAPMART, "$HapMap") or die "\nCould not open Hapmap file!\n"; while (){ chomp; my @hapmart = split; my $pos = $hapmart[1]; my $chr = $hapmart[0]; $chromosome{$hapmart[2]}=$chr; push(@{$AllHapMart{$chr}}, [@hapmart]); # save this for later } close (HAPMART); # Step 1: Find markers in motifs (genome-wide) # The program goes through each FASTA-file, one at a time, to search for the motif. When found the program searches through the motif for SNPs. In addition the program counts how many times the motif was found (with or without SNPs), how many times a SNP occured in position 1, 2, 3 and so on, and how many motifs that had one SNP, two SNPs, three SNPs and so on. For motifs with more than one SNP a warning is anounced and the position of these potentially problematic SNPs are printed out in the terminal window. print "Step 1 ...\n"; my @fastafiles = `ls $FASTAdir/*.fa`; my %RawPosition; open (MOTIFFILE, "$motifs") or die "\nCould not open Motif file $motifs!\n"; my $motif; while (){ chomp ($_); $_ =~ tr/a-z/A-Z/; $motif = $motif.$_; } my $motiflength = length($motif); print "My motif is $motiflength character long\n"; my @motifarray = split (//, $motif); my $changedmotif = ''; foreach my $motifletter (@motifarray){ if ($motifletter =~ s/A/[ARMWDHV]/g) {$changedmotif = $changedmotif.$motifletter; next;} if ($motifletter =~ s/C/[CYMSBHV]/g) {$changedmotif = $changedmotif.$motifletter; next;} if ($motifletter =~ s/G/[GRKSBDV]/g) {$changedmotif = $changedmotif.$motifletter; next;} if ($motifletter =~ s/T/[TYKWBDH]/g) {$changedmotif = $changedmotif.$motifletter; next;} if ($motifletter =~ s/R/[RAGMKWSBDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/Y/[YCTMKWSBDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/M/[MCARYWSBDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/K/[KTGRYWSBDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/W/[WTARYMKBDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/S/[SCGRYMKBDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/B/[BCTGRYMKWSDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/D/[DATGRYMKWSBHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/H/[HATCRYMKWSBDV]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/V/[VACGRYMKWSBDH]/g) {$changedmotif=$changedmotif.$motifletter; next;} if ($motifletter =~ s/N/[ACGTRYMKWSBDHV]/g) {$changedmotif=$changedmotif.$motifletter; next;} die "\nThis motif contains the illegal character $motifletter!\n"; } my $complimentarymotif = $motif; $complimentarymotif =~ tr/ACGTRYMKWSBDHV/TGCAYRKMWSVHDB/; my $reversecomplimentarymotif = scalar reverse ($complimentarymotif); print "The original motif is: $motif\nThe reverse complement is: $reversecomplimentarymotif\n"; my @reversearray = split (//, $reversecomplimentarymotif); my $changedreversemotif = ''; foreach my $reverseletter (@reversearray){ if ($reverseletter =~ s/A/[ARMWDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/C/[CYMSBHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/G/[GRKSBDV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/T/[TYKWBDH]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/R/[RAGMKWSBDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/Y/[YCTMKWSBDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/M/[MCARYWSBDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/K/[KTGRYWSBDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/W/[WTARYMKBDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/S/[SCGRYMKBDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/B/[BCTGRYMKWSDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/D/[DATGRYMKWSBHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/H/[HATCRYMKWSBDV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/V/[VACGRYMKWSBDH]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} if ($reverseletter =~ s/N/[ACGTRYMKWSBDHV]/g) {$changedreversemotif = $changedreversemotif.$reverseletter; next;} die "\nThis motif contains the illegal character $reverseletter!\n"; } print "The IUPAC-motif is: $changedmotif\nThe reverse IUPAC-complement is: $changedreversemotif\n"; my $character = $motiflength-1; foreach my $fastafile (@fastafiles) { chomp($fastafile); open (FASTAINPUT, "$fastafile") or die "\nCould not open the snp masked fasta file $fastafile!\n"; $fastafile =~ /(chr\d+).subst/; my $chromosome = $1; print LOG "Currently working on Chromosome $chromosome\n"; my $sequence; my $linecounter; my $position = 0; while () { chomp ($_); if ($_=~ />/) {next;} $linecounter++; $_ =~ tr/a-z/A-Z/; $sequence = $sequence.$_; } my $sequencelength = length($sequence); print LOG "$sequencelength\t$linecounter\n"; while ($position != $sequencelength-$character) { my $testsequence = substr($sequence, $position, $motiflength); $position++; if (($testsequence =~ $changedmotif) or ($testsequence =~ $changedreversemotif)) {$totalNumberOfMotifs++;} # this counts the total number of motifs encountered unless ($testsequence =~ /[MRWSYUHDVKB]/) {next;} # if the motif does not contain any SNP, it is of no further interest if (($testsequence =~ $changedmotif) or ($testsequence =~ $changedreversemotif)) { my $snpfound = 0; for (my $i=0;$i<=$character; $i++){ my $snpsequence = substr($testsequence, $i, 1); unless ($snpsequence =~ /[ATGC]/){ my $resultposition = $position+$i; push (@{$RawPosition{$chromosome}},[$testsequence, $resultposition]); $PositionOfSNPInMotif{$i}++; $snpfound++; } # the end of the unless-loop } # the end of the for-loop # now we know how many SNPs occurred in the motif. Save this in a hash: $NumberOfSNPsInMotif{$snpfound}++; # code to provide a warning for those SNPs that lie in motifs with multiple polymorphisms, which may indicate low sequence quality if ($snpfound>=2){ push(@{$ProblemSNPs{$chromosome}}, [$position, $character]); print "Problem SNP found: $chromosome\t$position\t$character\n"; } } # the end of the if-loop } # the end of the while-loop close (FASTAINPUT); } # the end of foreach chromosome print "The motif was found $totalNumberOfMotifs times (with or without any SNPs in it).\n"; foreach my $key (sort { $a <=> $b} keys %PositionOfSNPInMotif){ print "Position $key had $PositionOfSNPInMotif{$key} mutations.\n"; } foreach my $key (sort { $a <=> $b} keys %NumberOfSNPsInMotif){ print "There were $NumberOfSNPsInMotif{$key} motifs with $key SNP(s)\n"; } while (my ($reportchr, $reportarray) = each %RawPosition){ print LOG "$reportchr\t". scalar(@{$reportarray}) . "\n";} # Step 2: Find rs-numbers for markers from Step 1 # By comparing the hash with SNPs identified to be placed in motifs genome-wide, that was created in the previous step, with the HapMap-data-hash created in the beginning, the program identifies the rs-numbers of each SNP placed in motifs genome-wide. print "Step 2 ...\n"; while (my ($chromosome, $positionarray) = each %RawPosition){ foreach my $entry(@{$positionarray}){ my $foundthismotifSNP = 0; my $position = @{$entry}[1]; while (my ($key, $value) = each %AllHapMart){ unless ($key eq $chromosome){next;} foreach my $region(@{$value}){ if ($position eq @{$region}[1]) {print LOG "@{$region}\n"; my $HapMapRs = @{$region}[2]; $InMotif{$HapMapRs}=1; $foundthismotifSNP = 1; last;} } } unless ($foundthismotifSNP){print "No rs-number was found for the following sequence: " . @{$entry}[0] . " at position " . @{$entry}[1] . " on chromosome " . $chromosome . "\n";} } } print LOG keys(%InMotif) . " markers have been found in the motif of interest genome-wide.\n"; # Step 3: Find markers in genomic regions of interest # The program goes through the user provided .bed file with information on the genomic regions of interest. For regions where the user has not defined a score this is set to 1. The information with the rs-number and the score-value is then saved in a hash for later use. print "Step 3 ...\n"; my %regions; open (BEDFILE, "$bedfile") or die "\nBedfile could not be opened!\n"; while (){ chomp; my @chip = split; my $chr = $chip[0]; my $start = $chip[1]; my $end = $chip[2]; unless (defined $chip[3]){$chip[3]='noname';} unless (defined $chip[4]){$chip[4]=1;} push(@{$regions{$chr}}, [$start, $end, $chip[3], $chip[4]]); # see if any SNPs are located within the region and assign the score to the SNP while (my ($key, $value) = each %AllHapMart){ unless ($key eq $chr){next;} foreach my $snpdata(@{$value}){ if((@{$snpdata}[1] >= $start) and (@{$snpdata}[1] <= $end)) { $InRegion{@{$snpdata}[2]}=$chip[4];} # assigns score value to InRegion hash for later use } } } close (BEDFILE); print LOG keys(%InRegion) . " markers have been found in genomic regions of interest.\n"; # Step 4: Combine lists according to user input, generate candiatelist # The data generated in step 2 and 3 are now used in order to generate candidate lists of SNPs placed in interesting regions and/or motifs. The user specified AND/OR/SCORE options are used in this step. If the user has chosen a "short"-run the program writes the motifsnplits.txt and regionsnplist.txt files here before it ends. If the user instead has chosen the "medium"-run the program identifies the rs-numbers for the potentially problematic SNPs identified in step 1 (motifs that contain more than one SNP). The information is then printed in the terminal window before the program ends. If neither the "short-" nor the "medium"-option were chosen the program will by default do the "long" run and continue with Step 5 after creating the candidate SNP-lists. print "Step 4 ...\n"; if ($combo eq 'AND'){ foreach my $motifSNP (keys %InMotif){ if ($InRegion{$motifSNP}){$candidateList{$motifSNP}=1;} }} if ($combo eq 'OR'){ foreach my $motifSNP (keys %InMotif){$candidateList{$motifSNP}=1;} foreach my $regionSNP (keys %InRegion){$candidateList{$regionSNP}=1;} } if ($combo eq 'SCORE'){ foreach my $motifSNP (keys %InMotif){if ($InRegion{$motifSNP}){$candidateList{$motifSNP}=1;}} foreach my $regionSNP (keys %InRegion){if ($InRegion{$regionSNP}>= $regionScore){$candidateList{$regionSNP}=1;}} } print LOG scalar (keys %candidateList) . " total candidate SNPs!\n"; if ($typeOfRun eq "short"){ open(MOTIFSNPLIST, '>motifsnplist.txt') or die "\nCould not write the mofif SNP list!\n"; foreach my $motifSNPrsnumber (keys %InMotif){ print MOTIFSNPLIST "$motifSNPrsnumber\n"; } close MOTIFSNPLIST; open(REGIONSNPLIST, '>regionsnplist.txt') or die "\nCould not write the region SNP list!\n"; foreach my $regionSNPrsnumber (keys %InRegion){ print REGIONSNPLIST "$regionSNPrsnumber\n"; } close REGIONSNPLIST; die "\nDone with descriptive statistics. Run ends here.";} #-------------------------------------- # End here for short runs #-------------------------------------- if ($typeOfRun eq "medium"){ while (my ($problemchr, $problementry) = each %ProblemSNPs){ foreach my $problemcoordinates(@{$problementry}){ while (my ($hapmapchr, $hapmaprow) = each %AllHapMart){ # print "Comparing problemchr $problemchr with hapmapchr $hapmapchr\n"; unless ($problemchr eq $hapmapchr){next;} foreach my $hapmapcoordinates(@{$hapmaprow}){ #print "Hapmap: @{$hapmapcoordinates}[1]\tProblem: @{$problemcoordinates}[0]\t @{$problemcoordinates}[1]\n"; if ((@{$hapmapcoordinates}[1] >= @{$problemcoordinates}[0]) and (@{$hapmapcoordinates}[1] <= (@{$problemcoordinates}[0]+@{$problemcoordinates}[1]))){ push(@potentialProblemSNPs, @{$hapmapcoordinates}[2]);} } } } } print "\nThe following SNPs may be problematic because they are located in motifs with more than one SNP:\n" . "@potentialProblemSNPs" . "\n"; die "\nDone with descriptive statistics and warning about SNPs. This run ends here."; } #-------------------------------------- # End here for medium runs #-------------------------------------- # Step 5: go through the Mapfile and obtain a list of all genotyped markers # The program uses the information provided in the map-file to make a hash of all genotyped markers in the material. print "Step 5...\n"; open (MAPFILE, "$mapfile") or die "\nNo mapfile found!\n"; while (){ chomp; my @mapinput = split; $InMap{$mapinput[1]}=1; } close MAPFILE; foreach my $option (@ARGV){ print LOG "$option\n";} print LOG keys(%InMap) . " markers have been read from the mapfile.\n"; # Step 6: Find proxy markers for those markers that have not been genotyped # First the program compares the list with candidate SNPs, created in step 4, with the hash of genotyped markers created in step 5, in order to identify which of the candidate SNPs that are included in the material. For the ungenotyped markers the program then will search for SNPs in high linkage disequilibrium with the candidate SNP by searching through the LD-data files. The identified SNPs are written to a lddata.txt file. Then the program goes through the list of identified LD-markers to see if any of these have been genotyped in the material. If a candidate SNP has several LD-SNPs the one with the highest r2-value is chosen. These SNPs will be written to a genotyped.lddata.txt file. print "Step 6 ...\n"; open (LDDATA, '>lddata.txt') or die "\nCould not write LD data!\n"; foreach my $candidatemarker(keys %candidateList){ if ($InMap{$candidatemarker}){$finalList{$candidatemarker}=1;next;} $LDchromosome{$chromosome{$candidatemarker}}=1; $LDList{$candidatemarker}=1; } foreach my $testchromosome (keys %LDchromosome){ open (HAPMAP_DATA, "zcat $LDdir/ld_$testchromosome\_CEU.txt.gz |") or die "\nCould not find gziped LD data for chromosome $testchromosome: $!\n"; while () { my @hapmapdata = split; my $rs1 = $hapmapdata [3]; my $rs2 = $hapmapdata [4]; my $r2 = $hapmapdata [6]; unless($r2 >= $userr2) {next;} foreach my $rsnr (keys %LDList) { if (($rsnr eq $rs1) or ($rsnr eq $rs2)) {print LDDATA "@hapmapdata\t$rsnr\t$testchromosome\n"}; } } close (HAPMAP_DATA); } close LDDATA; print LOG scalar (keys %finalList) . " interesting SNPs with genotypes!\n"; # Clean LD markers, only keeping genotyped markers with the highest LD my %seen; open (CLEAN, '> genotyped.lddata.txt'); open (LDINPUT, 'lddata.txt') or die "\nCouldn't open the lddata I just wrote!\n"; while () { chomp; my @input = split; unless($InMap{$input[3]} or $InMap{$input[4]}){next;} # we already know that one of the SNPs has no genotype. If the other one has no genotype either, then it does not make sense to pick it as an LD SNP. Therefore, it is better to try the next line. my $rs1 = $input[9]; my $r2 = $input[6]; if ((not $seen{$rs1}) or ($r2 > $seen{$rs1}->[6])) { $seen{$rs1} = [@input]; } } while (my ($key, $value) = each(%seen)){ print CLEAN "@{$value}\n"; } close (LDINPUT); close (CLEAN); # Step 7: Write output: First the genotyped markers, then the LD markers. Then, to a separate list, the markers that should be analyzed but have not been genotyped and have no good proxy print "Step 7 ...\n"; # Open out file to write results unless($outfile){$outfile = 'ReMo.SNPs.out';} open (OUT, ">$outfile") or die "\nCannot write the ouptut!\n"; my %printedmarker=(); print OUT "Marker\tInRegion\tInMotif\tself_or_proxy\tOriginal_Marker\tr2\n"; foreach my $resultmarker (keys %finalList){ unless (defined $InRegion{$resultmarker}){$InRegion{$resultmarker}= 0;} unless (defined $InMotif{$resultmarker}){$InMotif{$resultmarker}=0;} print OUT "$resultmarker\t$InRegion{$resultmarker}\t$InMotif{$resultmarker}\tself\tna\tna\n"; $printedmarker{$resultmarker}=1; } open (LDSNPS, 'genotyped.lddata.txt') or die "\nCould not find clean LD data!\n"; while (){ my @cleaninput = split; unless(defined $InRegion{$cleaninput[3]}){$InRegion{$cleaninput[3]}= 0;} unless (defined $InMotif{$cleaninput[3]}){$InMotif{$cleaninput[3]}=0;} unless(defined $InRegion{$cleaninput[4]}){$InRegion{$cleaninput[4]}= 0;} unless (defined $InMotif{$cleaninput[4]}){$InMotif{$cleaninput[4]}=0;} if ($cleaninput[9] eq $cleaninput[3]){print OUT "$cleaninput[4]\t$InRegion{$cleaninput[9]}\t$InMotif{$cleaninput[9]}\tproxy\t$cleaninput[9]\t$cleaninput[5]\n";} if ($cleaninput[9] eq $cleaninput[4]){print OUT "$cleaninput[3]\t$InRegion{$cleaninput[9]}\t$InMotif{$cleaninput[9]}\tproxy\t$cleaninput[9]\t$cleaninput[5]\n";} $printedmarker{$cleaninput[9]}=1; } close LDSNPS; open (ORPHANS, '>list.of.markers.with.no.genotype.and.no.proxy.out') or die "\nCould not write the list of orphan markers!\n"; foreach my $orphan(keys %candidateList){ unless($printedmarker{$orphan}){print ORPHANS "$orphan\n";} } close ORPHANS;