package Bio::DB::CCDS;

use strict;
use warnings FATAL => 'all';

=head1 NAME

Bio::DB::CCDS - Query an NCBI CCDS (Consensus CDS) flat file by chromosome, accession, gene, or CCDS id

=head1 VERSION

Version 0.01

=cut

our $VERSION = '0.01';


=head1 SYNOPSIS

CCDS or Consensus Coding Sequences are protein coding regions that are identically annotated on the various
human and mouse genome assemblies, which are represented by accession numbers. They contain exon coordinates,
the gene name, Entrezgene ID, chromosome number, and Accession Number. Archive found at:
ftp://ftp.ncbi.nih.gov/pub/CCDS/

#!/usr/bin/perl -w
use strict;
use Bio::DB::CCDS;

#Replace /CCDS.curr... with the full file path
my $test = Bio::DB::CCDS->new("/CCDS.current.2_22_15.txt");

#Get a list of CCDS ids that correspond to a gene id
my @ccds = $test->Get_CCDS_IDs('gene_id' => '118424');
print "@ccds\n";

#Get all the information that corresponds to a particular ID- chromosome#, accession, etc.
my @gene = $test->Get_gene("$ccds[0]");
print "@gene\n";
@gene = $test->gene_coordinates("$ccds[0]");
print "@gene\n";
my %exolocs = $test->Get_exons("$ccds[0]");
foreach(keys %exolocs){
	print "$_ - ";
	print "$exolocs{$_}\n";
}
@gene = $test->Genes_list('Withdrawn', @ccds);
print "@gene\n";
my $row = $test->Write_ccds($ccds[0]);
print $row;
@gene = $test->Get_strand('+');
print "@gene[0..20]\n";
@gene = $test->Get_strand('-');
print "@gene[0..20]\n";
@gene = $test->Get_strand('+', @ccds);
print "@gene here\n";
@gene = $test->Get_strand('-', @ccds);
print "@gene there\n";
@gene = $test->Gene_id('1', 'Withdrawn');
print "@gene\n";
print @gene."\n";
@gene = $test->Gene_count('1', '+', 'Withdrawn');
print "@gene\n";
print @gene."\n";

=head1 SUBROUTINES/METHODS

new, Get_CCDS_IDs, Get_gene, gene_coordinates, Get_exons, Genes_list, Write_ccds, Get_strand, Gene_id, Gene_count

=head2 new
 Title   : new
 Usage   : my $object = Bio::DB::CCDS->new($filepath);
 Function: Initialize a new CCDS object by passing in the CCDS filepath
 Returns : A new Bio::DB::CCDS object.
 Args    : path to file
=cut

# Shared, file-scoped state populated by new(). None of it is stored on the
# object, so every Bio::DB::CCDS instance in the process sees the same data.
#
# %ccds maps each column name found in the file's header line to an array ref:
#   '#chromosome'          - element 0 is the root hash, keyed by chromosome
#   the accession column,
#   'gene', 'gene_id',
#   'ccds_id'              - refs to every hash created at that level of the
#                            tree, so a lookup can scan one level directly
#                            instead of walking down from the root
# Every hash below the root also carries a 'back' entry, [own key, parent hash].
my %ccds; # Keys are the header column names (chromosome, accession, gene, gene id, ...).
my %ccdsids; # CCDS id => the tree hash that holds that id's row data.
use base qw(Bio::Root::Root);
my $proornuc; # Name of the header column that contains "accession"; set by new().

# Constructor: read a tab-separated CCDS file and index its rows.
#
# Args    : $filepath - path to the file. The first non-blank line is the
#           header; one of its column names must contain "accession". Each
#           data row supplies, in order: chromosome, accession, gene name,
#           gene id, CCDS id, then up to six further columns.
# Returns : the object produced by the parent class constructor.
# Dies    : when no path is given, the file cannot be opened, the file has no
#           header line, no accession column exists, or a data row has fewer
#           than five columns.
#
# The parsed data is written to the file-scoped %ccds / %ccdsids / $proornuc
# variables rather than to the object, so all instances share it. A later call
# replaces the per-column arrays in %ccds but only adds to %ccdsids.
#
# Tree layout (each hash below the root also has 'back' => [own key, parent]):
#   chromosome -> accession -> gene name -> gene id -> CCDS id -> [columns 6..11]
sub new{
	my($class,@args) = @_;
	my $self = $class->SUPER::new(@args);
	my $filepath = shift @args;
	die "Cannot open file: no file path given" unless defined $filepath;
	# Three-argument open: the path can never be read as a mode or a pipe.
	open(my $file, '<', $filepath) or die "Cannot open file '$filepath': $!";
	my $total = do{
		local $/ = undef;
		<$file>;
	};
	close($file);
	$total = '' unless defined $total;
	# Accept Unix, old-Mac and Windows line endings. Matching "\r\n" first keeps
	# a Windows line break from producing an extra empty line, and blank lines
	# are dropped so they are not parsed as (empty) data rows.
	my @lines = grep { m/\S/ } split(/\r\n|\r|\n/, $total);
	die "CCDS file '$filepath' has no header line" unless @lines;
	foreach my $key (split(/\t/, shift @lines)){
		$proornuc = $key if $key =~ m/accession/;
		$ccds{$key} = [];
	}
	die "No accession column found in the header of '$filepath'"
		if @lines && !defined $proornuc;
	my %chrs;
	push(@{$ccds{'#chromosome'}}, \%chrs);
	# %ccds slot that records the hashes created at each depth of the tree.
	my @level_index = ($proornuc, 'gene', 'gene_id', 'ccds_id');
	foreach my $line (@lines){
		my @fields = split(/\t/, $line);
		die "Malformed CCDS line (fewer than 5 columns): $line" if @fields < 5;
		my $node = \%chrs;
		# Walk chromosome -> accession -> gene -> gene id, creating any level
		# that is missing and registering it in the matching %ccds array.
		foreach my $depth (0..3){
			my $key = $fields[$depth];
			unless(exists $node->{$key}){
				my %child = ('back' => [$key, $node]);
				$node->{$key} = \%child;
				push(@{$ccds{$level_index[$depth]}}, \%child);
			}
			$node = $node->{$key};
		}
		# $node is now the hash keyed by CCDS id; store the remaining columns.
		$node->{$fields[4]} = [@fields[5..10]];
		$ccdsids{$fields[4]} = $node;
	}
	return $self;
}

=head2 Get_CCDS_IDs
 Title   : Get_CCDS_IDs
 Usage   : my @ccds = $object->Get_CCDS_IDs('#chromosome' => '1');
 Function: Takes two arguments, first is the type of the argument you want to send
 	   and the second argument is the actual argument you give to find the associated CCDS ids.
 Returns : A list of CCDS ids associated with the argument
 Args    :
 '#chromosome' => chromosome number
 'accession' => accession number
 'gene' => gene name
 'gene_id' or 'id' => gene id
=cut

# Return the CCDS ids filed under a chromosome, accession, gene name or gene id.
#
# Args    : $type       - kind of identifier, tested in this order:
#                         contains "accession" or "ensembl" -> accession
#                         contains "chr"                    -> chromosome
#                         equals "gene" or contains "name"  -> gene name
#                         contains "id"                     -> gene id
#           $identifier - the value to look up
# Returns : list of CCDS ids in no particular order. The list is empty when
#           nothing matches, when $type is not recognised, or when an argument
#           is missing.
#
# Accession, gene and gene-id lookups scan every chromosome, so the result does
# not depend on how the accession is numbered or on which chromosome happens to
# hold the first match.
sub Get_CCDS_IDs{
	my $self = shift;
	my ($type, $identifier) = @_;
	my @CCDS_list;
	return @CCDS_list unless defined $type && defined $identifier;
	# 'back' is the parent-link key present in every tree level, never data.
	return @CCDS_list if $identifier eq 'back';
	my $chrs = $ccds{'#chromosome'} ? ${$ccds{'#chromosome'}}[0] : undef;
	return @CCDS_list unless ref($chrs) eq 'HASH';

	my @start;  # hashes the search descends from
	my $levels; # hash levels between @start and the hashes keyed by CCDS id
	if ($type =~ m/accession/ || $type =~ m/ensembl/){
		foreach my $chr (keys %{$chrs}){
			my $by_accession = $chrs->{$chr};
			push(@start, $by_accession->{$identifier})
				if exists $by_accession->{$identifier};
		}
		$levels = 2;
	}
	elsif ($type =~ m/chr/){
		push(@start, $chrs->{$identifier}) if exists $chrs->{$identifier};
		$levels = 3;
	}
	elsif ($type eq 'gene' || $type =~ m/name/){
		foreach my $by_gene (@{$ccds{'gene'} || []}){
			push(@start, $by_gene->{$identifier}) if exists $by_gene->{$identifier};
		}
		$levels = 1;
	}
	elsif ($type =~ m/id/){
		foreach my $by_gene_id (@{$ccds{'gene_id'} || []}){
			push(@start, $by_gene_id->{$identifier})
				if exists $by_gene_id->{$identifier};
		}
		$levels = 0;
	}
	else{
		return @CCDS_list;
	}
	@CCDS_list = _ccds_ids_under($levels, @start);
	return @CCDS_list;
}

# Private helper: descend $levels hash levels below each of @nodes (skipping the
# 'back' parent links) and return the keys of the hashes reached.
sub _ccds_ids_under{
	my ($levels, @nodes) = @_;
	foreach my $step (1..$levels){
		my @children;
		foreach my $node (@nodes){
			push(@children, map { $node->{$_} } grep { $_ ne 'back' } keys %{$node});
		}
		@nodes = @children;
	}
	my @ids;
	foreach my $node (@nodes){
		push(@ids, grep { $_ ne 'back' } keys %{$node});
	}
	return @ids;
}
=head2 Get_gene
 Title   : Get_gene
 Usage   : my @gene = $object->Get_gene($ccds[3]);
 Function: Gives you all the information associated with a CCDS id in order
 Returns : All information associated with the CCDS id
 Args    : CCDS id
=cut
# Return the complete row stored for one CCDS id.
#
# Args    : $gene - CCDS id
# Returns : list of chromosome, accession, gene name, gene id, CCDS id,
#           followed by the remaining columns stored for the row by new()
#           (six slots, indices 5..10).
# Dies    : when no id is given or the id is not in the index.
#
# The ancestors are read through the 'back' links by reference; no hash in the
# tree is copied.
sub Get_gene{
	my $self = shift;
	my $gene = shift;
	die "Get_gene requires a CCDS id" unless defined $gene;
	my $node = exists $ccdsids{$gene} ? $ccdsids{$gene} : undef;
	die "Unknown CCDS id '$gene'"
		unless ref($node) eq 'HASH'
			&& ref($node->{$gene}) eq 'ARRAY'
			&& ref($node->{'back'}) eq 'ARRAY';
	# Stringify every value so the caller never receives the stored array.
	my @annotation = ("$gene", map { "$_" } @{$node->{$gene}});
	# Climb four levels: gene id, gene name, accession, chromosome.
	foreach my $level (1..4){
		my $back = $node->{'back'};
		unshift(@annotation, "$back->[0]");
		$node = $back->[1];
	}
	return @annotation;
}
=head2 gene_coordinates
 Title   : gene_coordinates
 Usage   : my @location = $object->gene_coordinates($ccds[3]);
 Function: Gives you the chromosome number, and position of the CCDS on the chromosome
 Returns : a list of the chromosome number, the start position, and the end location, in that order
 Args    : CCDS id
 Warning : Be aware of the accession number
=cut
# Return the chromosome plus the start and end positions of a CCDS id:
# elements 0, 7 and 8 of the row produced by Get_gene.
sub gene_coordinates{
	my ($self, $ccds_id) = @_;
	my @row = $self->Get_gene($ccds_id);
	my @coordinates = @row[0, 7, 8];
	return @coordinates;
}
=head2 Get_exons
 Title   : Get_exons
 Usage   : my %exons = $object->Get_exons($ccds[3]);
	   foreach(keys %exons){
	   	print "$_ - ";
		print "$exons{$_}\n";
	   }
 Function: Gives you the exon coordinates. Every position will be subtracted by the starting position
	   so the first exon starts at 0, add the numbers to the starting position from gene_coordinates
	   for the actual location
 Returns : a hash of the exons, with the start of each exon as the key and its end as the value
 Args    : CCDS id
=cut
# Return the exon intervals of a CCDS id, relative to the CCDS start position.
#
# Args    : CCDS id (passed straight to Get_gene)
# Returns : hash of exon start => exon end, each with the CCDS start position
#           (element 7 of the Get_gene row) subtracted, so the first exon
#           starts at 0. The hash is empty when the row has no numeric start
#           or no exon list.
#
# The exon list is element 9 of the Get_gene row. It is read as "start-end"
# number pairs; any surrounding brackets, commas or spaces are ignored
# (presumably the "[a-b, c-d]" form used in NCBI CCDS files - confirm against
# the file in use). Every pair is parsed numerically, so single-exon entries
# are handled, and an exon whose start merely contains the CCDS start as a
# substring is no longer taken for the first exon.
sub Get_exons{
	my $self = shift;
	my @gene = $self->Get_gene(shift);
	my ($from, $locations) = @gene[7, 9];
	my %locs;
	return %locs
		unless defined $from && defined $locations && $from =~ m/^\d+$/;
	while ($locations =~ m/(\d+)\s*-\s*(\d+)/g){
		my ($start, $end) = ($1, $2);
		$locs{$start - $from} = $end - $from;
	}
	return %locs;
}
=head2 Genes_list
 Title   : Genes_list
 Usage   : my @customwithdrawn = $object->Genes_list('Withdrawn' => @ccds);
	   my @public = $object->Genes_list('Public');
 Function: You can choose whether to retrieve a list of public or withdrawn sequences from your own
	   CCDS id list or from the whole file
	   Returns all public CCDS ids by default
	   Does not return ids that are under review
 Returns : CCDS id list
 Args    :
 None by default
 First: 'Withdrawn' or 'Public'
 Second: CCDS id list or nothing
=cut
# Filter CCDS ids by status.
#
# Args    : $arg - anything containing "ithdraw" selects withdrawn entries;
#                  any other value, or no value at all, selects public ones
#           @ids - optional list of CCDS ids to filter; defaults to every id
#                  in the index
# Returns : the ids whose status (first stored column of the row) begins with
#           "Withdrawn" or "Public" respectively.
#
# NOTE(review): the status is now the string being tested and the requested
# word is the pattern. The original had the operands the other way round, so a
# longer status that merely starts with the word (e.g. a "Withdrawn, ..."
# variant) could never match. Confirm that including such variants is intended.
#
# Ids that are not in the index are skipped rather than looked up, so they are
# no longer created as empty entries in the shared index.
sub Genes_list{
	my $self = shift;
	my $arg = shift;
	my @custom = @_ ? @_ : keys %ccdsids;
	my $wanted = (defined $arg && $arg =~ m/ithdraw/) ? 'Withdrawn' : 'Public';
	my @list;
	foreach my $entry (@custom){
		next unless defined $entry && exists $ccdsids{$entry};
		my $row = $ccdsids{$entry}{$entry};
		next unless ref($row) eq 'ARRAY' && defined $row->[0];
		push(@list, "$entry") if $row->[0] =~ m/^\Q$wanted\E/;
	}
	return @list;
}
=head2 Write_ccds
 Title   : Write_ccds
 Usage   : print $object->Write_ccds($ccds[1]);
 Function: Gives you a tab separated string representation of everything associated with the CCDS id
 Returns : Tab formatted string representation of everything associated with the CCDS id
 Args    :
 CCDS id
=cut
# Render the row of a CCDS id as one line of text.
#
# Args    : CCDS id (passed straight to Get_gene)
# Returns : the Get_gene fields joined by single tab characters and terminated
#           by a newline. The previous form appended a tab to every field and
#           then interpolated the array, which put a space after each tab and
#           left a trailing tab before the newline.
sub Write_ccds{
	my $self = shift;
	my @gene = $self->Get_gene(shift);
	return join("\t", @gene) . "\n";
}
=head2 Get_strand
 Title   : Get_strand
 Usage   : my @plus = $object->Get_strand('+' => @ccds);
 Function: Takes a list of CCDS ids and isolates the ids associated with either the sense
	   or antisense strands. Looks at all ids by default.
 Returns : list of CCDS ids that are on the sense ('+') or antisense ('-') strand
 Args    :
 First: '+' or '-'
 Second optional: list of CCDS ids to perform operation on
=cut
# Filter CCDS ids by strand.
#
# Args    : $arg - anything containing "+" selects the '+' strand; any other
#                  value, or no value, selects '-'
#           @ids - optional list of CCDS ids to filter; without it every id
#                  registered in the 'ccds_id' level of the tree is examined
# Returns : the ids whose strand (second stored column of the row) equals the
#           selected strand.
#
# Ids that are not in the index, and rows without a strand value, are skipped.
# The custom-list lookup no longer creates empty entries in the shared index
# for unknown ids.
sub Get_strand{
	my $self = shift;
	my $arg = shift;
	my $strand = (defined $arg && $arg =~ m/\+/) ? '+' : '-';
	if(@_ == 0){
		my @genes;
		foreach my $refs (@{$ccds{'ccds_id'} || []}){
			foreach my $entry (keys %{$refs}){
				# 'back' is the link to the parent hash, not a CCDS id.
				next if $entry eq 'back';
				my $row = $refs->{$entry};
				next unless ref($row) eq 'ARRAY' && defined $row->[1];
				push(@genes, "$entry") if $row->[1] eq $strand;
			}
		}
		return @genes;
	}
	my @list;
	foreach my $entry (@_){
		next unless defined $entry && exists $ccdsids{$entry};
		my $row = $ccdsids{$entry}{$entry};
		next unless ref($row) eq 'ARRAY' && defined $row->[1];
		push(@list, "$entry") if $row->[1] eq $strand;
	}
	return @list;
}
=head2 Gene_id
 Title   : Gene_id
 Usage   : my @custom = $object->Gene_id('1', 'Withdrawn' => @ccds);
 Function: Get genes of a certain chromosome and status from your own
	   CCDS id list or from the whole file
 Returns : CCDS id list
 Args    :
 First: #chromosome
 Second: 'Withdrawn' or 'Public'
 Third: @ccds or nothing
=cut
# Return the CCDS ids that are both on chromosome $chr and of status $status.
#
# Args    : $chr, $status, then an optional list of CCDS ids that is handed to
#           Genes_list to restrict the status filter
# Returns : the ids returned by Genes_list that Get_CCDS_IDs also reports for
#           the chromosome, in Genes_list order.
sub Gene_id{
	my ($self, $chr, $status, @subset) = @_;
	my %on_chromosome;
	$on_chromosome{$_} = 1 for $self->Get_CCDS_IDs('#chromosome' => "$chr");
	# An empty @subset makes this the same call as Genes_list($status).
	my @with_status = $self->Genes_list("$status", @subset);
	return map { "$_" } grep { exists $on_chromosome{"$_"} } @with_status;
}
=head2 Gene_count
 Title   : Gene_count
 Usage   : my @custom = $object->Gene_count('1','+', 'Withdrawn' => @ccds);
 Function: Get genes of a certain chromosome, strand, and status from your own
	   CCDS id list or from the whole file
 Returns : CCDS id list
 Args    :
 First: #chromosome
 Second: '+' or '-'
 Third: 'Withdrawn' or 'Public'
 Fourth: @ccds or nothing
=cut
# Return the CCDS ids matching a chromosome, a strand and a status.
#
# Args    : $chr, $strand ('+' or '-'), $status ('Withdrawn' or 'Public'),
#           then an optional list of CCDS ids to restrict the search to
# Returns : the ids reported by Get_strand that Gene_id also reports, in
#           Get_strand order. Despite the name this is a list of ids, not a
#           count.
#
# The optional ids are forwarded as a list. They were previously interpolated
# as "@_", which merged them into one space-separated string, so a custom list
# of more than one id could never match anything.
sub Gene_count{
	my $self = shift;
	my $chr = shift;
	my $strand = shift;
	my $status = shift;
	my @custom = @_;
	my @genid = $self->Gene_id("$chr", "$status", @custom);
	my @stran = $self->Get_strand("$strand", @custom);
	my %hash = map {$_ => 1} @genid;
	my @intersection;
	foreach my $entry (@stran){
		push(@intersection, "$entry") if exists $hash{"$entry"};
	}
	return @intersection;
}


=head1 AUTHOR

Adur Pandya, C<< <anp375 at nyu.edu> >>
under supervision of Mgavi Elombe Brathwaite, 

=head1 BUGS

Please report any bugs or feature requests to C<bug-ccds at rt.cpan.org>, or through
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=CCDS>.  I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.




=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Bio::DB::CCDS


You can also look for information at:

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=CCDS>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/CCDS>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/CCDS>

=item * Search CPAN

L<http://search.cpan.org/dist/CCDS/>

=back


=head1 ACKNOWLEDGEMENTS


=head1 LICENSE AND COPYRIGHT



This program is free software; you can redistribute it and/or modify it
under the terms of the Artistic License (2.0). You may obtain a
copy of the full license at:

L<http://www.perlfoundation.org/artistic_license_2_0>

Any use, modification, and distribution of the Standard or Modified
Versions is governed by this Artistic License. By using, modifying or
distributing the Package, you accept this license. Do not use, modify,
or distribute the Package, if you do not accept this license.

If your Modified Version has been derived from a Modified Version made
by someone other than you, you are nevertheless required to ensure that
your Modified Version complies with the requirements of this license.

This license does not grant you the right to use any trademark, service
mark, tradename, or logo of the Copyright Holder.

This license includes the non-exclusive, worldwide, free-of-charge
patent license to make, have made, use, offer to sell, sell, import and
otherwise transfer the Package with respect to any patent claims
licensable by the Copyright Holder that are necessarily infringed by the
Package. If you institute patent litigation (including a cross-claim or
counterclaim) against any party alleging that the Package constitutes
direct or contributory patent infringement, then this Artistic License
to you shall terminate on the date that such litigation is filed.

Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


=cut

1; # End of CCDS

