Line: 1 to 1 | ||||||||
---|---|---|---|---|---|---|---|---|
Changes needed to port OSG modifications to GRAM5 jobmanager-condor Overview | ||||||||
Line: 7 to 7 | ||||||||
-- IgorSfiligoi - 2009/11/16 | ||||||||
Changed: | ||||||||
< < | The changes implemented to the UCSD GT2 condor.pm file from the GRAM5 Beta1 condor.in file are as follows (bear in mind, this is not currently a working file and still requires further patching). | |||||||
> > | A) Differences between the "vanilla" GRAM5 condor.in file and the OSG/VDT "vanilla" condor.in file: | |||||||
Changed: | ||||||||
< < | - (line 9) | |||||||
> > | The GRAM5 file specifies its configuration file differently; it makes the following changes:
28a9,10 > use Config; > 34c16 < my ($condor_submit, $condor_rm, $condor_config); --- > my ($condor_submit, $condor_rm); 40,48d21 < $condor_config = '@CONDOR_CONFIG@'; < $condor_check_vanilla_files < = '@CONDOR_CHECK_VANILLA_FILES@'; < $condor_mpi_script = '@CONDOR_MPI_SCRIPT@'; < < if ($condor_config ne '') < { < $ENV{CONDOR_CONFIG} = $condor_config; < } | |||||||
Changed: | ||||||||
< < | use Config; | |||||||
> > | VDT makes individual condor log files for pre-WS GRAM jobs:
61a35,43 > # We want to have individual Condor log files for each job for > # pre-WS GRAM, but still have a single log file for WS GRAM > # (which uses the SEG to monitor job status). > if ( defined( $description->factoryendpoint() ) ) { > $self->{individual_condor_log} = 1; > } else { > $self->{individual_condor_log} = 0; > } > 72c54 < last; --- > break; 78c60 < if (! exists($self->{condor_logfile})) --- > if (! exists($self->{condor_logfile}) || $self->{individual_condor_log}) 88c70,75 < $self->{condor_logfile} = "$log_dir/gram_condor_log"; --- > if ( $self->{individual_condor_log} ) { > $self->{condor_logfile} = "$log_dir/gram_condor_log." > . $description->uniq_id(); > } else { > $self->{condor_logfile} = "$log_dir/gram_condor_log"; > } 90c77 < if(! -e $self->{condor_logfile}) --- > if ((! -e $self->{condor_logfile}) && ($self->{individual_condor_log} == 0)) 583a544,552 > > # Should we really delete a file that we have not > # been able to read. This leaves room for a race > # condition So the following code is commented out: > > #if ( ${self}->{individual_condor_log} ) { > # $self->log_to_gratia(); > # unlink($self->{condor_logfile}); > #} 589c558,559 < $state = Globus::GRAM::JobState::FAILED; --- > # Don't delete the condor log when not reporting DONE > return Globus::GRAM::Error::SYSTEM_CANCELLED(); 597a568,571 > if ( ${self}->{individual_condor_log} ) { > $self->log_to_gratia(); > unlink($self->{condor_logfile}); > } | |||||||
Changed: | ||||||||
< < | -/+ (line 16) my ($condor_submit, $condor_rm, $condor_config); | |||||||
> > | VDT distinguishes between 64-bit and 32-bit library paths, allowing you to use either:
133a121 > my %library_vars; 211a196,201 > $library_vars{LD_LIBRARY_PATH} = 0; > if($Config{osname} eq 'irix') > { > $library_vars{LD_LIBRARYN32_PATH} = 0; > $library_vars{LD_LIBRARY64_PATH} = 0; > } 221a245,254 > > foreach (keys %library_vars) > { > my $library_path = join(':', $description->library_path()); > > if($library_vars{$_} == 0) > { > push(@environment, [$_, $library_path]); > } > } | |||||||
Changed: | ||||||||
< < | + (line 22)
$condor_config = '@CONDOR_CONFIG@'; $condor_check_vanilla_files = '@CONDOR_CHECK_VANILLA_FILES@'; $condor_mpi_script = '@CONDOR_MPI_SCRIPT@'; if ($condor_config ne '') { $ENV{CONDOR_CONFIG} = $condor_config; } | |||||||
> > | VDT/OSG specific modification:
216,219c205,241 < if(ref($tuple) || scalar(@$tuple) = 2) < { < return Globus::GRAM::Error::RSL_ENVIRONMENT(); < } --- > if(ref($tuple) || scalar(@$tuple) = 2) > { > return Globus::GRAM::Error::RSL_ENVIRONMENT(); > } > if(exists($library_vars{$tuple->[0]})) > { > $tuple->[1] .= ":$library_string"; > $library_vars{$tuple->[0]} = 1; > } > } > > ##### OSG-Specific modification ##### > ##### These should not affect any non-OSG site, ##### > ##### unless you define $OSG_GRID ##### > > # First, we figure out if this is an OSG installation, and if so, where > # OSG is installed on the worker nodes > my $osg_grid = ''; > my $use_osg_grid = 1; > my $use_dynamic_wn_tmp = 1; > map { > if ($_->[0] eq "OSG_GRID") { > $osg_grid = $_->[1]; > } elsif ($_->[0] eq "OSG_DONT_USE_OSG_GRID_FOR_GL") { > $use_osg_grid = 0; > } > > } @environment; > > # If this is an OSG installation, we set GLOBUS_LOCATION based on OSG_GRID, > # and we set OSG_WN_TMP based on _CONDOR_SCRATCH_DIR > if ($osg_grid ne '') { > map { > if ($use_osg_grid && $_->[0] eq "GLOBUS_LOCATION") { > $_->[1] = $osg_grid . "/globus"; > } > } @environment; 220a243 > ##### End OSG-Specific modification ##### | |||||||
Changed: | ||||||||
< < | + (line 150)
elsif($description->jobtype() eq 'mpi' && $condor_mpi_script ne 'no') { $universe = 'parallel'; } | |||||||
> > | GRAM5 moved this portion of code and added a line for the "parallel" universe case:
233,248d265 < if ($description->directory() =~ m|^[^/]|) < { < my $home = (getpwuid($<))[7]; < < $description->add('directory', "$home/".$description->directory()); < } < if ($description->executable() =~ m|^[^/]|) < { < $description->add('executable', < $description->directory() . '/' . $description->executable()); < } < if ($universe eq 'parallel') < { < unshift(@arguments, $description->executable); < $description->add('executable', $condor_mpi_script); < } 251c268 < $argument_string = '"' . join(' ', --- > $argument_string = join(' ', 254,256c271,272 < $_ =~ s/'/''/g; < $_ =~ s/"/""/g; < $_ = "'$_'"; --- > $_ =~ s/"/\\\"/g; #" > $_; 258c274 < @arguments) . '"'; --- > @arguments); 295a312,323 > if ($description->directory() =~ m|^[^/]|) > { > my $home = (getpwuid($<))[7]; > > $description->add('directory', "$home/".$description->directory()); > } > if ($description->executable() =~ m|^[^/]|) > { > $description->add('executable', > $description->directory() . '/' . $description->executable()); > } > | |||||||
Changed: | ||||||||
< < | -/+ (line 176)
if($universe eq 'standard' || $condor_check_vanilla_files eq 'yes') | |||||||
> > | VDT patch for gratia:
320a349,351 > # Patched by the VDT > print SCRIPT_FILE "+GratiaJobOrigin=\"GRAM\"\n"; > # End VDT patch 646a624,673 | |||||||
Changed: | ||||||||
< < | -/+ (line 203)
foreach my $tuple ($description->environment()) | |||||||
> > | GRAM5 made a lot of patches to the SCRIPT_FILE section:
324,404c355,363 < my $shouldtransferfiles = $description->shouldtransferfiles(); < if (defined($shouldtransferfiles)) < { < $self->log("Adding \"should_transfer_files = $shouldtransferfiles\"\n"); < print SCRIPT_FILE "should_transfer_files = $shouldtransferfiles\n"; < } < < my $WhenToTransferOutput = $description->whentotransferoutput(); < if (defined($WhenToTransferOutput)) < { < $self->log("Adding \"WhenToTransferOutput = $WhenToTransferOutput\"\n"); < print SCRIPT_FILE "WhenToTransferOutput = $WhenToTransferOutput\n"; < } < < my $transfer_input_files = $description->transferinputfiles(); < if (defined($transfer_input_files)) < { < $self->log("Adding explicitly \"transfer_input_files = " < ."$transfer_input_files\"\n"); < print SCRIPT_FILE "transfer_input_files = $transfer_input_files\n"; < } < else < { < my @transfer_input_files = $description->transferinputfiles(); < if (defined($transfer_input_files[0])) < { < my $file_list_string = ""; < foreach my $file (@transfer_input_files) < { < $file_list_string .= "$file, "; < } < $file_list_string < $self->log("Adding \"transfer_input_files = $file_list_string\"\n"); < print SCRIPT_FILE "transfer_input_files = $file_list_string\n"; < } < } < < my $transfer_output_files = $description->transferoutputfiles(); < if (defined($transfer_output_files)) < { < $self->log("Adding explicitly \"transfer_output_files = " < ."$transfer_output_files\"\n"); < print SCRIPT_FILE "transfer_output_files = $transfer_output_files\n"; < } < else < { < my @transfer_output_files = $description->transferoutputfiles(); < if (defined($transfer_output_files[0])) < { < my $file_list_string = ""; < foreach my $file (@transfer_output_files) < { < $file_list_string . "$file, "; < } < $file_list_string =~ s/, $//; < $self->log("Adding \"transfer_output_files = " < ."$file_list_string\"\n"); < print SCRIPT_FILE "transfer_output_files = $file_list_string\n"; < } < } < < if ($universe eq 'parallel') < { < print SCRIPT_FILE "Output = " . 
$description->stdout() . "\n"; < print SCRIPT_FILE "Output = " . $description->stderr() . "\n"; < print SCRIPT_FILE "machine_count = " . $description->count() . "\n"; < print SCRIPT_FILE "queue\n"; < } < else < { < for (my $i = 0; $i < $description->count(); $i++) { < if ($multi_output) { < print SCRIPT_FILE "Output = " . < $self->{STDIO_MERGER}->add_file('out') . "\n"; < print SCRIPT_FILE "Error = " . < $self->{STDIO_MERGER}->add_file('err') . "\n"; < } else { < print SCRIPT_FILE "Output = " . $description->stdout() . "\n"; < print SCRIPT_FILE "Error = " . $description->stderr() . "\n"; < } < print SCRIPT_FILE "queue 1\n"; --- > for (my $i = 0; $i < $description->count(); $i++) { > if ($multi_output) { > print SCRIPT_FILE "Output = " . > $self->{STDIO_MERGER}->add_file('out') . "\n"; > print SCRIPT_FILE "Error = " . > $self->{STDIO_MERGER}->add_file('err') . "\n"; > } else { > print SCRIPT_FILE "Output = " . $description->stdout() . "\n"; > print SCRIPT_FILE "Error = " . $description->stderr() . "\n"; 405a365 > print SCRIPT_FILE "queue 1\n"; 435c395 < $failure_text = join("", @response_text); --- > $failure_text = join(//, @response_text); | |||||||
Changed: | ||||||||
< < | -/+ (line 268)
$argument_string = '"' . join(' ', map { $_ =~ s/'/''/g; $_ =~ s/"/""/g; $_ = "'$_'"; } @arguments) . '"'; | |||||||
> > | B) NFS-lite changes made to the condor.pm file: | |||||||
Changed: | ||||||||
< < | -/+ (line 355)
my $shouldtransferfiles = $description->shouldtransferfiles(); if (defined($shouldtransferfiles)) { $self->log("Adding \"should_transfer_files = $shouldtransferfiles\"\n"); print SCRIPT_FILE "should_transfer_files = $shouldtransferfiles\n"; } my $WhenToTransferOutput = $description->whentotransferoutput(); if (defined($WhenToTransferOutput)) { $self->log("Adding \"WhenToTransferOutput = $WhenToTransferOutput\"\n"); print SCRIPT_FILE "WhenToTransferOutput = $WhenToTransferOutput\n"; } my $transfer_input_files = $description->transferinputfiles(); if (defined($transfer_input_files)) { $self->log("Adding explicitly \"transfer_input_files = "."$transfer_input_files\"\n"); print SCRIPT_FILE "transfer_input_files = $transfer_input_files\n"; } else { my @transfer_input_files = $description->transferinputfiles(); if (defined($transfer_input_files[0])) { my $file_list_string = ""; foreach my $file (@transfer_input_files) { $file_list_string .= "$file, "; } $file_list_string $self->log("Adding \"transfer_input_files = $file_list_string\"\n"); print SCRIPT_FILE "transfer_input_files = $file_list_string\n"; } } my $transfer_output_files = $description->transferoutputfiles(); if (defined($transfer_output_files)) { $self->log("Adding explicitly \"transfer_output_files = "."$transfer_output_files\"\n"); print SCRIPT_FILE "transfer_output_files = $transfer_output_files\n"; } else { my @transfer_output_files = $description->transferoutputfiles(); if (defined($transfer_output_files[0])) { my $file_list_string = ""; foreach my $file (@transfer_output_files) { $file_list_string . "$file, "; } $file_list_string =~ s/, $//; $self->log("Adding \"transfer_output_files = "."$file_list_string\"\n"); print SCRIPT_FILE "transfer_output_files = $file_list_string\n"; } } if ($universe eq 'parallel') { print SCRIPT_FILE "Output = " . $description->stdout() . "\n"; print SCRIPT_FILE "Output = " . $description->stderr() . "\n"; print SCRIPT_FILE "machine_count = " . $description->count() . 
"\n"; print SCRIPT_FILE "queue\n"; } else { for (my $i = 0; $i < $description->count(); $i++) { if ($multi_output) { print SCRIPT_FILE "Output = " . $self->{STDIO_MERGER}->add_file('out') . "\n"; print SCRIPT_FILE "Error = " . $self->{STDIO_MERGER}->add_file('err') . "\n"; } else { print SCRIPT_FILE "Output = " . $description->stdout() . "\n"; print SCRIPT_FILE "Error = " . $description->stderr() . "\n"; } print SCRIPT_FILE "queue 1\n"; } } | |||||||
> > | 134,137d127 226,232c216,236 386,417c351 740,764d673 | |||||||
Changed: | ||||||||
< < | These are the GRAM5 changes; below are the NFS-Lite changes | |||||||
> > | ||||||||
Changed: | ||||||||
< < | + (line 28)
my $isNFSLite = 1; # Flag to tell if we are using NFS lite. 1 or true for yes my $scratch_isset = 0; # Flag if the SCRATCH_DIRECTORY environment variable is set indicating likely GRAM job | |||||||
> > | | |||||||
Changed: | ||||||||
< < | -/+ (line 216)
# NFS lite start if ($isNFSLite) { my $osg_grid = ''; my $use_osg_grid = 1; map { if ($_->[0] eq "OSG_GRID") { $osg_grid = $_->[1]; } elsif ($_->[0] eq "OSG_DONT_USE_OSG_GRID_FOR_GL") { $use_osg_grid = 0; } elsif ($_->[0] eq "LOGNAME") { $logname = $_->[1]; } elsif ($_->[0] eq "SCRATCH_DIRECTORY") { $scratch_isset = 1; $scratch_directory = $_->[1]; $_->[1] = '$_CONDOR_SCRATCH_DIR'; } elsif ($_->[0] eq "X509_USER_PROXY") { $_->[0] = "CHANGED_X509"; } } @environment; # If this is an OSG installation, we set GLOBUS_LOCATION based on OSG_GRID if ($osg_grid ne '') { map { if ($use_osg_grid && $_->[0] eq "GLOBUS_LOCATION") { $_->[1] = $osg_grid . "/globus"; } } @environment; } if ($scratch_isset) { # Remote_InitialDir apparently suppresses the setting of the SCRATCH_DIRECTORY env variable $wrapper_arguments .= " -wrapper_iwd " . ' $_CONDOR_SCRATCH_DIR'; # UCSD Mod push(@environment,["MY_INITIAL_DIR",'$_CONDOR_SCRATCH_DIR']); } elsif ( $description->directory() # If the directory ends with the logname it might be a globus-job-run job # so take control of the initial_dir $wrapper_arguments . " -wrapper_iwd " . ' $_CONDOR_SCRATCH_DIR'; # UCSD Mod push(@environment,["MY_INITIAL_DIR",'$_CONDOR_SCRATCH_DIR'] ); } else { # assume that remote_initialdir is set and the submitter knows what they are # doing. $wrapper_arguments .= " -wrapper_iwd " . $description->directory(); # UCSD Mod push(@environment,["MY_INITIAL_DIR",$description->directory()] ); } } # NFS Lite End # START UCSD Mods #if ($logname =~ /.*cms.*/) { $AccountingGroup = "group_cms." . $logname; } | |||||||
> > | C) UCSD-specific changes made to the condor.pm file: | |||||||
Changed: | ||||||||
< < | -/+ (line 334)
print SCRIPT_FILE "Requirements = OpSys? = \"LINUX\" && (Arch \"X86_64\" || Arch =\"INTEL\")\n"; # UCSD Mod+ (line 345) # START UCSD Modification print SCRIPT_FILE "Arguments = $argument_string $wrapper_arguments\n"; # UCSD added wrapper args # END UCSD Modification+ (line 349) $description->save("/var/tmp/description-". $description->uniq_id() . ".desc"); print SCRIPT_FILE "+AccountingGroup = \"$AccountingGroup\"\n"; # UCSD # NFS Lite mode if ($isNFSLite) { print SCRIPT_FILE "should_transfer_files = YES\n"; print SCRIPT_FILE "when_to_transfer_output = ON_EXIT\n"; print SCRIPT_FILE "transfer_output = true\n"; # Lets test to make sure the scratch directory exists if ( -d "$scratch_directory" ) { $self->logMe("$scratch_directory found!"); } else { $self->logMe("$scratch_directory NOT found!"); } # GRAM Files to transfer to the worker node # Only do this if we are dealing with a GRAM job that has set up a scratch area # otherwise we assume it is a globus-job-run or the users is using remote_initialdir $self->logMe($scratch_directory); if ( $scratch_isset && !( $self->isWSGramGlobus() ) ) { $self->logMe("Passed the test"); if ( $self->isWSGramCondorG() ) { $scratch_directory = $description->directory(); $self->logMe($scratch_directory); } my $sdir; $self->logMe($scratch_directory); opendir($sdir,$scratch_directory); my @sfiles = grep { !/^\./} readdir($sdir); $self->logMe(@sfiles); close $sdir; print SCRIPT_FILE "transfer_input_files = "; SFILE: foreach $f ( @sfiles ) { $f =~ s{\/\/}{\/}g; $f = $scratch_directory . "/" . 
$f; next SFILE if $f eq $description->executable(); next SFILE if $f eq $description->stdin(); next SFILE if $f eq $description->stdout(); push (@flist,"$f"); } print SCRIPT_FILE join(",",@flist); print SCRIPT_FILE "\n"; } } # End NFS Lite Mode+ (line 354) # UCSD Mode, placed here to over ride the attempt by users to override the following # 36 hours print SCRIPT_FILE "maxRunTime = 129600\n"; # Two Weeks print SCRIPT_FILE "maxQTime = 1209600\n"; print SCRIPT_FILE "periodic_remove = (RemoteWallClockTime? > \$(maxRunTime)) || ((QDate - CurrentTime? ) > \$(maxQTime))\n"; # UCSD Mode Ends+ line (370) ### UCSD MOD ### # Save the script we make my $tmpfname = "/var/tmp/script-" . $description->uniq_id() . ".script"; system("/bin/cp $script_filename $tmpfname"); ### END UCSD MOD ###+ (line 625) my $condor_version_number = 0; + (line 627) unless ($condor_version_number) { my $condor_version_string = `condor_version 2>/dev/null`; $condor_version_number = join("", map { m&^(\d+)&?sprintf("%03d", $1):"000" } ($condor_version_string =~ m!^\$CondorVersion: ([^\.]+)\.([^\.]+)\.([^\.]+)!s)); return 1 unless ($condor_version_number < 6009000); }-/+ (line 659) sub isWSGramCondorG { my $self = shift; my $description = $self->{JobDescription}; $self->logMe("Inside isWSGramCondorG"); my $jobcredentialendpoint = ""; $jobcredentialendpoint = $description->jobcredentialendpoint(); $self->logMe("Got Job Credential $jobcredentialendpoint"); if ( !($self->isWSGramGlobus() ) && ($jobcredentialendpoint ne "") ) { $self->logMe("Inside isWSGramCondorG test true"); return 1; } else { $self->logMe("Failed to get job credential"); return 0; }} sub logMe { my $self = shift; my $description = $self->{JobDescription}; my $entry = shift; my $unid = $description->uniq_id(); open(LOG,">>/var/tmp/logme-$unid.log") || die "Could not open file /var/tmp/logme-$unid.log:$!\n"; print LOG $entry; print LOG "\n"; close LOG;} sub isWSGramGlobus { my $self = shift; my $description = $self->{JobDescription}; 
$self->logMe("Inside isWSGramGlobus"); my $extensions = $description->extensions(); $self->logMe("Extensions $extensions"); if ($extensions =~ /globusrun/) { $self->logMe("Yes WSGramGlobus? "); return 1; } else { $self->logMe("No WSGramGlobus? "); return 0; }} | |||||||
> > | ||||||||
-- ChristopherTheissen - 2009/11/25 |