Batch Creator Script
<perl>
#!/usr/bin/perl -w
# ----------------------------------------------------------------------------------
# 2010-06-11 Version 1.0
# INTRODUCTION:
# This code takes a template and merges it with a data file to create batch
# files for loading microarray data into SMD [smd.stanford.edu]. The purpose
# is to automate the process of making annotations by having this script fill
# in fields that are always the same.
# >>Template File Construction:
# This file should include 2 rows of data with the first being the headers from
# "Result_Set_Name" to "PROBE_SET_ALGORITHM". The second row should contain data
# in any column that includes information that will never change for that
# template type. If you have multiple array types that you are loading you can
# make different templates.
# >>Data File Construction:
# This file should contain at least 2 rows (one for the headers and the other
# for the row of data that was annotated), but can be as many rows as you want.
# >>Note about file types:
# You can make the template and data file in Excel, but they must be saved as
# tab-delimited text files if you want them to work properly with this code.
# The resulting batch file will be named GSE####_batch.txt (where the #### will
# be replaced with the GSE number you give on the command line).
# HOW THE CODE WORKS:
# This code will first look at your data file and see if the cells are filled.
# If they are not, it will check the template. If neither is present this code
# will print an error telling you which column is missing data. Because the data
# file has priority over the template, you can overwrite any template options
# without changing the template itself. However, if you find yourself overwriting
# many of the template options it would be best to make a new template.
# >>Assumptions in this code
# This code assumes that Columns C, O, and S are blank and does not read them
# from the template or data files.
# >>Rules
# Listed at the bottom of this code is a quick glance at what rules this code
# follows. As a rule of thumb fill in Columns D, K, N, P, T, and U for the template
# and Columns E, F, G, Q, and R for the data file.
# QUESTIONS
# Contact me at amandasupak@gmail.com
# ----------------------------------------------------------------------------------
use strict;
use warnings;
use Getopt::Long;
use Data::Dumper;

# ---------------------------------------------------------------------------
# Setup: parse the command line, load the template and data files, and open
# the batch file for writing.
#
# Names defined here that the rest of the script relies on:
#   $gse_number  - GSE accession number with any "GSE" prefix removed
#   @template    - tab-separated cells of the template's second (values) row
#   @data_lines  - every line of the data file, header line included
#   OUTPUT       - write handle on GSE<number>_batch.txt
# ---------------------------------------------------------------------------

# Read a whole file and return its lines with the line endings removed.
# Unix (\n), Windows (\r\n) and bare-\r endings (as some Excel exports use)
# are all accepted, so a CRLF file no longer yields phantom empty lines.
sub _read_lines {
    my ($path) = @_;

    open( my $fh, '<', $path ) or die "Couldn't open file $path: $!\n";
    my $content = do { local $/; <$fh> };    # slurp, ignoring line endings
    close $fh;

    $content = '' unless defined $content;   # an empty file slurps as undef
    return split /\r\n|\r|\n/, $content;
}

my ( $template_file, $data_file, $gse_number );
my $result = GetOptions(
    "t|template=s"   => \$template_file,
    "d|data=s"       => \$data_file,
    "g|gse_number=s" => \$gse_number,
);
die "could not parse the command line options\n" unless $result;
die "needs a template file\n" unless $template_file;
die "needs a data file\n"     unless $data_file;
die "needs a GSE number!\n"   unless $gse_number;

# The template needs a header row and a values row.
my @template_lines = _read_lines($template_file);
die "template file needs a header row and a row of values\n"
    if @template_lines < 2;

# Print to the screen that the data is being processed
print "\nProcessing...\n";

# Strip "GSE" from "GSE#####" when typed at the "-g" prompt. What is left must
# be digits only, because it becomes part of the output file name.
$gse_number =~ s/^\s*GSE//i;
die "GSE number must look like GSE12345 or 12345\n"
    unless $gse_number =~ /^\d+$/;
print "$gse_number\n";

# Split the template into columns, only on the second line (the first is the
# headings).
my @template = split /\t/, $template_lines[1];

# Open the batch file. OUTPUT stays a bareword handle because the per-row loop
# later in the file prints to it by that name.
my $output_file = 'GSE' . $gse_number . '_batch.txt';
open( OUTPUT, '>', $output_file )
    or die "Couldn't open $output_file for writing: $!\n";

# Put the headers from the template into the output file
print OUTPUT $template_lines[0] . "\n";

# Read the data file; the first element is its header line.
my @data_lines = _read_lines($data_file);
# Cells of the data row currently being processed. Kept at file scope because
# getColumn() reads @data directly.
my @data;

# Number of data-file lines consumed so far: the header line plus every
# non-blank row written to the batch file (blank lines are not counted).
my $i = 0;

# (A) result_set_name is identical for every row, so build it once. Any "GSE"
# prefix still on the number is dropped first so it is not doubled.
( my $gse_digits = $gse_number ) =~ s/^GSE//i;
my $result_set_name = 'GSE' . $gse_digits . '_ecolihub';

# Fetch a column through getColumn(), mapping an undefined result to '' so the
# string operations below never act on undef.
sub _column_or_blank {
    my ($index) = @_;
    my $value = getColumn($index);
    return defined $value ? $value : '';
}

foreach my $line (@data_lines) {

    # The first line of the data file holds the headers; skip it.
    if ( $i == 0 ) { $i++; next; }

    next if $line =~ /^\s*$/;

    @data = split /\t/, $line;

    my @fields_to_print;

    # (A) result_set_name
    $fields_to_print[0] = $result_set_name;

    # (B) result_set_description
    $fields_to_print[1] = $fields_to_print[0];

    # (C) add_to_exp (blank)
    $fields_to_print[2] = "";

    # (D) print_name
    $fields_to_print[3] = _column_or_blank(3);

    # (E) experiment_category
    $fields_to_print[4] = _column_or_blank(4);

    # (F) experiment_subcategory
    $fields_to_print[5] = _column_or_blank(5);

    # (G) slide_name
    $fields_to_print[6] = _column_or_blank(6);

    # (H) exp_file_location
    $fields_to_print[7] = _column_or_blank(7);

    # (I) cel_file_location
    $fields_to_print[8] = $fields_to_print[6] . '.CEL';

    # (J) gene_file_location
    $fields_to_print[9] = $fields_to_print[6] . '.CEL.chp.txt';

    # (K) single_scan_file_location
    $fields_to_print[10] = _column_or_blank(10);

    # (L) single_channel_description
    my $s = _column_or_blank(17);
    if ( $s =~ /^"(.*)"$/ ) {

        # strip off the leading and trailing quotes added by Excel
        $s = $1;
    }
    if ( $s =~ /^(.*?)<c-anno>/ ) {

        # grab the text before the first <c-anno> tag (non-greedy, so a
        # second tag later in the cell does not extend the description)
        $fields_to_print[11] = $1;
    }
    else {
        print STDERR "Please check the experiment description (Column R). It must include a description of the experiment before the <c-anno> tag.\n";

        # keep the column present so the remaining fields stay aligned
        $fields_to_print[11] = "";
    }

    # (M) experiment_name
    $fields_to_print[12] = $fields_to_print[11];

    # (N) normalization
    $fields_to_print[13] = _column_or_blank(13);

    # (O) norm_value (blank)
    $fields_to_print[14] = "";

    # (P) experimenter
    $fields_to_print[15] = _column_or_blank(15);

    # (Q) date
    my $date = _column_or_blank(16);
    if ( $date !~ /^(?:19|20)\d\d-(?:0[1-9]|1[012])-(?:0[1-9]|[12][0-9]|3[01])$/ ) {

        # match a date like YYYY-MM-DD
        print STDERR "Date in the wrong format! Please use YYYY-MM-DD format.\n";
        $date = "";
    }
    $fields_to_print[16] = $date;

    # (R) experiment_description
    $fields_to_print[17] = $s;    # from (L)

    # (S) collaborative_group (blank)
    $fields_to_print[18] = "";

    # (T) individual_user
    $fields_to_print[19] = _column_or_blank(19);

    # (U) probe_set_algorithm
    $fields_to_print[20] = _column_or_blank(20);

    # now we have @fields_to_print, make into scalar, write it to the batch
    # file and echo the slide name to the screen
    my $x = join( "\t", @fields_to_print );
    print OUTPUT $x . "\n";
    print "$fields_to_print[6]\n";

    # Debugging aid, disabled: dump data / template / output per column.
    #for (my $j=0, $c=scalar(@fields_to_print); $j<=$c; $j++) {
    #    printf("%d,\t%s,\t%s,\t%s\n",$j, $data[$j], $template[$j], $fields_to_print[$j] );
    #}
    #die();

    $i++;
}

# Buffered write errors only surface when the handle is closed, so check it.
close(OUTPUT) or die "Couldn't finish writing the batch file: $!\n";
# Determines if the data file or the template file has priority.
#
# Looks up one column for the current row: the cell from @data wins, and the
# cell from @template is used only when the data cell is missing or empty.
# A cell counts as filled when it is defined and non-empty, so a literal "0"
# is a valid value and is not mistaken for a blank.
#
# Parameter: zero-based column index (0 = column A).
# Returns:   the chosen cell value, or '' when neither file has a value for
#            that column (a message naming the column index goes to STDERR).
sub getColumn {
    my $column_index = shift @_;

    # Data first, template second: the first filled cell is the answer.
    foreach my $cells ( \@data, \@template ) {
        my $value = $cells->[$column_index];
        return $value if defined $value && length $value;
    }

    print STDERR "Column " . $column_index . " doesn't have any values in data or template!\n";
    return '';
}
# Report how many rows were written. $i also counted the header line of the
# data file, so subtract it; an empty data file leaves $i at 0, which is
# reported as 0 rows instead of -1.
my $r = $i > 0 ? $i - 1 : 0;
print "Created $r row(s)\n";
__END__
# Rules:
Column A is taken from the template and user input (GSE####)
Column B is (column A)
Column I is (column G + .CEL)
Column J is (column G + .CEL.chp.txt)
Column H is taken from template unless specified by user (.EXP files)
Column L is taken from text of Column R before "<c-anno>"
Column M is taken from Column L</perl>