Changes needed to port OSG modifications to GRAM5 jobmanager-condor

Overview

OSG uses a patched/augmented version of jobmanager-condor (condor.in->condor.pm). The jobmanager-condor changed slightly between GT2 and GRAM5, so we had to merge the two.

-- IgorSfiligoi - 2009/11/16

A) Differences between the "vanilla" GRAM5 condor.pm file and the OSG/VDT "vanilla" condor.pm file:

This is in terms of "diff GRAM5condor.pm OSG/VDTcondor.pm"

The GRAM5 file specifies its configuration file differently; the differences are as follows:

28a9,10
- use Config;
-
34c16
+ my ($condor_submit, $condor_rm, $condor_config);
---
- my ($condor_submit, $condor_rm);
40,48d21
+ $condor_config = '@CONDOR_CONFIG@';
+ $condor_check_vanilla_files
+ = '@CONDOR_CHECK_VANILLA_FILES@';
+ $condor_mpi_script = '@CONDOR_MPI_SCRIPT@';
+
+ if ($condor_config ne '')
+ {
+ $ENV{CONDOR_CONFIG} = $condor_config;
+ }

VDT makes individual condor log files for pre-WS GRAM jobs:

61a35,43
- # We want to have individual Condor log files for each job for
- # pre-WS GRAM, but still have a single log file for WS GRAM
- # (which uses the SEG to monitor job status).
- if ( defined( $description->factoryendpoint() ) ) {
- $self->{individual_condor_log} = 1;
- } else {
- $self->{individual_condor_log} = 0;
- }
-
72c54
+ last;
---
- break;
78c60
+ if (! exists($self->{condor_logfile}))
---
- if (! exists($self->{condor_logfile}) || $self->{individual_condor_log})
88c70,75
+ $self->{condor_logfile} = "$log_dir/gram_condor_log";
---
- if ( $self->{individual_condor_log} ) {
- $self->{condor_logfile} = "$log_dir/gram_condor_log."
- . $description->uniq_id();
- } else {
- $self->{condor_logfile} = "$log_dir/gram_condor_log";
- }
90c77
+ if(! -e $self->{condor_logfile})
---
- if ((! -e $self->{condor_logfile}) && ($self->{individual_condor_log} == 0))
583a544,552
-
- # Should we really delete a file that we have not
- # been able to read. This leaves room for a race
- # condition So the following code is commented out:
-
- #if ( ${self}->{individual_condor_log} ) {
- # $self->log_to_gratia();
- # unlink($self->{condor_logfile});
- #}
589c558,559
+ $state = Globus::GRAM::JobState::FAILED;
---
- # Don't delete the condor log when not reporting DONE
- return Globus::GRAM::Error::SYSTEM_CANCELLED();
597a568,571
- if ( ${self}->{individual_condor_log} ) {
- $self->log_to_gratia();
- unlink($self->{condor_logfile});
- }

VDT distinguishes between 64-bit and 32-bit library paths, allowing you to use either:

133a121
- my %library_vars;
211a196,201
- $library_vars{LD_LIBRARY_PATH} = 0;
- if($Config{osname} eq 'irix')
- {
- $library_vars{LD_LIBRARYN32_PATH} = 0;
- $library_vars{LD_LIBRARY64_PATH} = 0;
- }
221a245,254
-
- foreach (keys %library_vars)
- {
- my $library_path = join(':', $description->library_path());
-
- if($library_vars{$_} == 0)
- {
- push(@environment, [$_, $library_path]);
- }
- }

VDT/OSG specific modification:

216,219c205,241
+ if(!ref($tuple) || scalar(@$tuple) != 2)
+ {
+ return Globus::GRAM::Error::RSL_ENVIRONMENT();
+ }
---
- if(!ref($tuple) || scalar(@$tuple) != 2)
- {
- return Globus::GRAM::Error::RSL_ENVIRONMENT();
- }
- if(exists($library_vars{$tuple->[0]}))
- {
- $tuple->[1] .= ":$library_string";
- $library_vars{$tuple->[0]} = 1;
- }
- }
-
- ##### OSG-Specific modification #####
- ##### These should not affect any non-OSG site, #####
- ##### unless you define $OSG_GRID #####
-
- # First, we figure out if this is an OSG installation, and if so, where
- # OSG is installed on the worker nodes
- my $osg_grid = '';
- my $use_osg_grid = 1;
- my $use_dynamic_wn_tmp = 1;
- map {
- if ($_->[0] eq "OSG_GRID") {
- $osg_grid = $_->[1];
- } elsif ($_->[0] eq "OSG_DONT_USE_OSG_GRID_FOR_GL") {
- $use_osg_grid = 0;
- }
-
- } @environment;
-
- # If this is an OSG installation, we set GLOBUS_LOCATION based on OSG_GRID,
- # and we set OSG_WN_TMP based on _CONDOR_SCRATCH_DIR
- if ($osg_grid ne '') {
- map {
- if ($use_osg_grid && $_->[0] eq "GLOBUS_LOCATION") {
- $_->[1] = $osg_grid . "/globus";
- }
- } @environment;
220a243
- ##### End OSG-Specific modification #####

GRAM5 moved this portion of code and added a line for the "parallel" universe case:

233,248d265
+ if ($description->directory() =~ m|^[^/]|)
+ {
+ my $home = (getpwuid($<))[7];
+
+ $description->add('directory', "$home/".$description->directory());
+ }
+ if ($description->executable() =~ m|^[^/]|)
+ {
+ $description->add('executable',
+ $description->directory() . '/' . $description->executable());
+ }
+ if ($universe eq 'parallel')
+ {
+ unshift(@arguments, $description->executable);
+ $description->add('executable', $condor_mpi_script);
+ }
251c268
+ $argument_string = '"' . join(' ',
---
- $argument_string = join(' ',
254,256c271,272
+ $_ =~ s/'/''/g;
+ $_ =~ s/"/""/g;
+ $_ = "'$_'";
---
- $_ =~ s/"/\\\"/g; #"
- $_;
258c274
+ @arguments) . '"';
---
- @arguments);
295a312,323
- if ($description->directory() =~ m|^[^/]|)
- {
- my $home = (getpwuid($<))[7];
-
- $description->add('directory', "$home/".$description->directory());
- }
- if ($description->executable() =~ m|^[^/]|)
- {
- $description->add('executable',
- $description->directory() . '/' . $description->executable());
- }
-

VDT patch for gratia:

320a349,351
- # Patched by the VDT
- print SCRIPT_FILE "+GratiaJobOrigin=\"GRAM\"\n";
- # End VDT patch
646a624,673
- # Patched by the VDT
- sub log_to_gratia
- {
- my $self = shift;
- my $log_filename = $self->{condor_logfile};
-
- # Select the gratia location
- my $env = "$ENV{VDT_LOCATION}";
- if ( "x$env" eq "x" ) {
- $env = "$ENV{GLOBUS_LOCATION}/..";
- }
- if ( "x$env" eq "x" ) {
- $env = "/var/tmp";
- }
- my $log_dir = "$env/gratia/var/data/";
-
- if ( -d $log_dir ) {
- # For now assume that the existence of the directory means that
- # accounting is enabled.
- if ( -r $log_filename ) {
- $self->log("Logging for accounting purpose the file $log_filename into $log_dir");
- @args = ("cp", "$log_filename" , "$log_dir" );
- system(@args) == 0 or $self->log("Error: system @args failed: $?");
- } else {
- if ( ! -e $log_filename ) {
- $self->log("Logging for accounting purpose failed: $log_filename does not exist");
- } else {
- $self->log("Logging for accounting purpose failed: can not read the file $log_filename");
- }
- return 0; # should return a proper Globus failure code.
- }
- }
- return 1; # Should return a proper Globus success code
- }
-
- sub cache_cleanup
- {
- my $self = shift;
- my $description = $self->{JobDescription};
-
- if ( ${self}->{individual_condor_log} ) {
- $self->log("Deleting Condor user log");
- $self->log_to_gratia();
- unlink($self->{condor_logfile});
- }
-
- return $self->SUPER::cache_cleanup($self);
- }
-
-

GRAM5 made a lot of patches to the SCRIPT_FILE section:

324,404c355,363
+ my $shouldtransferfiles = $description->shouldtransferfiles();
+ if (defined($shouldtransferfiles))
+ {
+ $self->log("Adding \"should_transfer_files = $shouldtransferfiles\"\n");
+ print SCRIPT_FILE "should_transfer_files = $shouldtransferfiles\n";
+ }
+
+ my $WhenToTransferOutput = $description->whentotransferoutput();
+ if (defined($WhenToTransferOutput))
+ {
+ $self->log("Adding \"WhenToTransferOutput = $WhenToTransferOutput\"\n");
+ print SCRIPT_FILE "WhenToTransferOutput = $WhenToTransferOutput\n";
+ }
+
+ my $transfer_input_files = $description->transferinputfiles();
+ if (defined($transfer_input_files))
+ {
+ $self->log("Adding explicitly \"transfer_input_files = "
+ ."$transfer_input_files\"\n");
+ print SCRIPT_FILE "transfer_input_files = $transfer_input_files\n";
+ }
+ else
+ {
+ my @transfer_input_files = $description->transferinputfiles();
+ if (defined($transfer_input_files[0]))
+ {
+ my $file_list_string = "";
+ foreach my $file (@transfer_input_files)
+ {
+ $file_list_string .= "$file, ";
+ }
+ $file_list_string =~ s/, $//;
+ $self->log("Adding \"transfer_input_files = $file_list_string\"\n");
+ print SCRIPT_FILE "transfer_input_files = $file_list_string\n";
+ }
+ }
+
+ my $transfer_output_files = $description->transferoutputfiles();
+ if (defined($transfer_output_files))
+ {
+ $self->log("Adding explicitly \"transfer_output_files = "
+ ."$transfer_output_files\"\n");
+ print SCRIPT_FILE "transfer_output_files = $transfer_output_files\n";
+ }
+ else
+ {
+ my @transfer_output_files = $description->transferoutputfiles();
+ if (defined($transfer_output_files[0]))
+ {
+ my $file_list_string = "";
+ foreach my $file (@transfer_output_files)
+ {
+ $file_list_string .= "$file, ";
+ }
+ $file_list_string =~ s/, $//;
+ $self->log("Adding \"transfer_output_files = "
+ ."$file_list_string\"\n");
+ print SCRIPT_FILE "transfer_output_files = $file_list_string\n";
+ }
+ }
+
+ if ($universe eq 'parallel')
+ {
+ print SCRIPT_FILE "Output = " . $description->stdout() . "\n";
+ print SCRIPT_FILE "Output = " . $description->stderr() . "\n";
+ print SCRIPT_FILE "machine_count = " . $description->count() . "\n";
+ print SCRIPT_FILE "queue\n";
+ }
+ else
+ {
+ for (my $i = 0; $i < $description->count(); $i++) {
+ if ($multi_output) {
+ print SCRIPT_FILE "Output = " .
+ $self->{STDIO_MERGER}->add_file('out') . "\n";
+ print SCRIPT_FILE "Error = " .
+ $self->{STDIO_MERGER}->add_file('err') . "\n";
+ } else {
+ print SCRIPT_FILE "Output = " . $description->stdout() . "\n";
+ print SCRIPT_FILE "Error = " . $description->stderr() . "\n";
+ }
+ print SCRIPT_FILE "queue 1\n";
---
- for (my $i = 0; $i < $description->count(); $i++) {
- if ($multi_output) {
- print SCRIPT_FILE "Output = " .
- $self->{STDIO_MERGER}->add_file('out') . "\n";
- print SCRIPT_FILE "Error = " .
- $self->{STDIO_MERGER}->add_file('err') . "\n";
- } else {
- print SCRIPT_FILE "Output = " . $description->stdout() . "\n";
- print SCRIPT_FILE "Error = " . $description->stderr() . "\n";
405a365
- print SCRIPT_FILE "queue 1\n";
435c395
+ $failure_text = join("", @response_text);
---
- $failure_text = join(//, @response_text);

B) NFS-lite changes made to the condor.pm file:

This is in terms of "diff condornfslite.pm OSG/VDTcondor.pm"

The following are changes to initiate NFS-lite:

134,137d127
+ my $isNFSLite = 1; # Flag to tell if we are using NFS lite. 1 or true for yes
+ my $scratch_isset = 0; # Flag if the SCRATCH_DIRECTORY environment variable is set indicating likely GRAM job
+
+
226,232c216,236
+
+ # NFS lite start
+ if ($isNFSLite) {
+
+ my $osg_grid = '';
+ my $use_osg_grid = 1;
+
---
- ##### OSG-Specific modification #####
- ##### These should not affect any non-OSG site, #####
- ##### unless you define $OSG_GRID #####
-
- # First, we figure out if this is an OSG installation, and if so, where
- # OSG is installed on the worker nodes
- my $osg_grid = '';
- my $use_osg_grid = 1;
- my $use_dynamic_wn_tmp = 1;
- map {
- if ($_->[0] eq "OSG_GRID") {
- $osg_grid = $_->[1];
- } elsif ($_->[0] eq "OSG_DONT_USE_OSG_GRID_FOR_GL") {
- $use_osg_grid = 0;
- }
-
- } @environment;
-
- # If this is an OSG installation, we set GLOBUS_LOCATION based on OSG_GRID,
- # and we set OSG_WN_TMP based on _CONDOR_SCRATCH_DIR
- if ($osg_grid ne '') {
234,245c238,239
+ if ($_->[0] eq "OSG_GRID") {
+ $osg_grid = $_->[1];
+ } elsif ($_->[0] eq "OSG_DONT_USE_OSG_GRID_FOR_GL") {
+ $use_osg_grid = 0;
+ } elsif ($_->[0] eq "LOGNAME") {
+ $logname = $_->[1];
+ } elsif ($_->[0] eq "SCRATCH_DIRECTORY") {
+ $scratch_isset = 1;
+ $scratch_directory = $_->[1];
+ $_->[1] = '$_CONDOR_SCRATCH_DIR';
+ } elsif ($_->[0] eq "X509_USER_PROXY") {
+ $_->[0] = "CHANGED_X509";
---
- if ($use_osg_grid && $_->[0] eq "GLOBUS_LOCATION") {
- $_->[1] = $osg_grid . "/globus";
248,271d241
+
+ # If this is an OSG installation, we set GLOBUS_LOCATION based on OSG_GRID
+ if ($osg_grid ne '') {
+ map {
+ if ($use_osg_grid && $_->[0] eq "GLOBUS_LOCATION") {
+ $_->[1] = $osg_grid . "/globus";
+ }
+ } @environment;
+ }
+
+ if ($scratch_isset) {
+ # Remote_InitialDir apparently suppresses the setting of the SCRATCH_DIRECTORY env variable
+ push(@environment,["MY_INITIAL_DIR",'$_CONDOR_SCRATCH_DIR']);
+ }
+ elsif ( $description->directory() =~ m/.+$logname/xms ) {
+ # If the directory ends with the logname it might be a globus-job-run job
+ # so take control of the initial_dir
+ push(@environment,["MY_INITIAL_DIR",'$_CONDOR_SCRATCH_DIR'] );
+ }
+ else {
+ # assume that remote_initialdir is set and the submitter knows what they are
+ # doing.
+ push(@environment,["MY_INITIAL_DIR",$description->directory()] );
+ }
273c243
+ # NFS Lite End
---
- ##### End OSG-Specific modification #####
386,417c351
+ # NFS Lite mode
+ if ($isNFSLite) {
+ print SCRIPT_FILE "should_transfer_files = YES\n";
+ print SCRIPT_FILE "when_to_transfer_output = ON_EXIT\n";
+ print SCRIPT_FILE "transfer_output = true\n";
+ # GRAM Files to transfer to the worker node
+ # Only do this if we are dealing with a GRAM job that has set up a scratch area
+ # otherwise we assume it is a globus-job-run or the users is using remote_initialdir
+ if ( $scratch_isset && !( $self->isWSGramGlobus() ) ) {
+ if ( $self->isWSGramCondorG() ) {
+ $scratch_directory = $description->directory();
+ }
+ my $sdir;
+ opendir($sdir,$scratch_directory);
+ my @sfiles = grep { !/^\./} readdir($sdir);
+ close $sdir;
+
+ print SCRIPT_FILE "transfer_input_files = ";
+ SFILE:
+ foreach $f ( @sfiles ) {
+ $f =~ s{\/\/}{\/}g;
+ $f = $scratch_directory . "/" . $f;
+ next SFILE if $f eq $description->executable();
+ next SFILE if $f eq $description->stdin();
+ next SFILE if $f eq $description->stdout();
+ push (@flist,"$f");
+ }
+ print SCRIPT_FILE join(",",@flist);
+ print SCRIPT_FILE "\n";
+ }
+ }
+ # End NFS Lite Mode
740,764d673
+ sub isWSGramCondorG {
+ my $self = shift;
+ my $description = $self->{JobDescription};
+ my $jobcredentialendpoint = "";
+ $jobcredentialendpoint = $description->jobcredentialendpoint();
+ if ( !($self->isWSGramGlobus() ) && ($jobcredentialendpoint ne "") ) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+ }
+
+ sub isWSGramGlobus {
+ my $self = shift;
+ my $description = $self->{JobDescription};
+ my $extensions = $description->extensions();
+ if ($extensions =~ /globusrun/) {
+ return 1;
+ }
+ else {
+ return 0;
+ }
+ }
+


C) UCSD Specific changes made to the condor.pm file:

This is in terms of "diff UCSDcondor.pm condornfslite.pm"

The UCSD condor pool is configured with a wrapper so extra parameters are required:

259a260
+ $wrapper_arguments .= " -wrapper_iwd " . ' $_CONDOR_SCRATCH_DIR'; # UCSD Mod
264a266
+ $wrapper_arguments .= " -wrapper_iwd " . ' $_CONDOR_SCRATCH_DIR'; # UCSD Mod
269a272
+ $wrapper_arguments .= " -wrapper_iwd " . $description->directory(); # UCSD Mod

UCSD Modification for accounting:

274a278,293
+ # START UCSD Mods
+ # Setup for groups by matching the logname to the appropriate condor group
+ # First set a default
+ $AccountingGroup = "group_other." . $logname;
+ # Then override if necessary
+ if ($logname =~ /.*cms.*/) { $AccountingGroup = "group_cms." . $logname; }
+ if ($logname =~ /.*cmspa.*/) { $AccountingGroup = "group_cmspa." . $logname; }
+ if ($logname =~ /.*cmsprod.*/) { $AccountingGroup = "group_cmsprod." . $logname; }
+ if ($logname =~ /.*cdf.*/) { $AccountingGroup = "group_cdf." . $logname; }
+ if ($logname =~ /.*harp.*/) { $AccountingGroup = "group_harp." . $logname; }
+ if ($logname =~ /.*caf.*/) { $AccountingGroup = "group_cdf." . $logname; }
+ if ($logname =~ /.*samgrid.*/) { $AccountingGroup = "group_samgrid." . $logname; }
+ if ($logname =~ /.*sbgrid.*/) { $AccountingGroup = "group_sbgrid." . $logname; }
+ if ($logname =~ /.*glowhtpc.*/) { $AccountingGroup = "group_glowhtpc." . $logname; }
+ # STOP UCSD Modification
+

UCSD defines the architecture and operating system:

363,364c382
- $requirements .= " && " if($requirements);
- $requirements .= "Memory >= " . $description->min_memory();
---
+ $requirements .= " && Memory >= " . $description->min_memory();
368,369c386,387
- print SCRIPT_FILE "Requirements = $requirements\n";
-
---
+ #print SCRIPT_FILE "Requirements = $requirements\n";
+ print SCRIPT_FILE "Requirements = OpSys == \"LINUX\" && (Arch == \"X86_64\" || Arch == \"INTEL\")\n"; # UCSD Mod

UCSD Script File modifications:

380c398,400
- print SCRIPT_FILE "Arguments = $argument_string\n";
---
+ # START UCSD Modification
+ print SCRIPT_FILE "Arguments = $argument_string $wrapper_arguments\n"; # UCSD added wrapper args
+ # END UCSD Modification
385c405,406
- print SCRIPT_FILE "+GratiaJobOrigin=\"GRAM\"\n";
---
+ $description->save("/var/tmp/description-". $description->uniq_id() . ".desc");
+ print SCRIPT_FILE "+AccountingGroup = \"$AccountingGroup\"\n"; # UCSD

UCSD tests to see if there is a scratch directory:

390a412,420
+
+ # Lets test to make sure the scratch directory exists
+ if ( -d "$scratch_directory" ) {
+ $self->logMe("$scratch_directory found!");
+ }
+ else {
+ $self->logMe("$scratch_directory NOT found!");
+ }
+
393a424
+ $self->logMe($scratch_directory);
395c426,427
- if ( $self->isWSGramCondorG() ) {
---
+ $self->logMe("Passed the test");
+ if ( $self->isWSGramCondorG() ) {
396a429
+ $self->logMe($scratch_directory);
398a432
+ $self->logMe($scratch_directory);
400a435
+ $self->logMe(@sfiles);

UCSD Mod:

419a455,461
+ # UCSD Mode, placed here to over ride the attempt by users to override the following
+ # 36 hours
+ print SCRIPT_FILE "maxRunTime = 129600\n";
+ # Two Weeks
+ print SCRIPT_FILE "maxQTime = 259200\n";
+ print SCRIPT_FILE "periodic_remove = (RemoteWallClockTime > \$(maxRunTime)) || ((QDate - CurrentTime ) > \$(maxQTime))\n";
+ # UCSD Mode Ends

UCSD Mod to save the script made:

435a478,483
+ ### UCSD MOD ###
+ # Save the script we make
+ my $tmpfname = "/var/tmp/script-" . $description->uniq_id() . ".script";
+ system("/bin/cp $script_filename $tmpfname");
+ ### END UCSD MOD ###
+

UCSD changes to VDT patches:

690a742
+ my $condor_version_number = 0;
692a745,752
+ unless ($condor_version_number) {
+ my $condor_version_string = `condor_version 2>/dev/null`;
+ $condor_version_number =
+ join("",
+ map { m&^(\d+)&?sprintf("%03d", $1):"000" }
+ ($condor_version_string =~ m!^\$CondorVersion: ([^\.]+)\.([^\.]+)\.([^\.]+)!s));
+ return 1 unless ($condor_version_number < 6009000);
+ }
725,739d784
- sub cache_cleanup
- {
- my $self = shift;
- my $description = $self->{JobDescription};
-
- if ( ${self}->{individual_condor_log} ) {
- $self->log("Deleting Condor user log");
- $self->log_to_gratia();
- unlink($self->{condor_logfile});
- }
-
- return $self->SUPER::cache_cleanup($self);
- }
-
-
742a788
+ $self->logMe("Inside isWSGramCondorG");
744a791
+ $self->logMe("Got Job Credential $jobcredentialendpoint");
745a793
+ $self->logMe("Inside isWSGramCondorG test true");
748a797
+ $self->logMe("Failed to get job credential");
752a802,812
+ sub logMe {
+ my $self = shift;
+ my $description = $self->{JobDescription};
+ my $entry = shift;
+ my $unid = $description->uniq_id();
+ open(LOG,">>/var/tmp/logme-$unid.log") || die "Could not open file /var/tmp/logme-$unid.log:$!\n";
+ print LOG $entry;
+ print LOG "\n";
+ close LOG;
+ }
+
755a816
+ $self->logMe("Inside isWSGramGlobus");
756a818
+ $self->logMe("Extensions $extensions");
757a820
+ $self->logMe("Yes WSGramGlobus ");
760a824
+ $self->logMe("No WSGramGlobus ");


-- ChristopherTheissen - 2009/12/14

Edit | Attach | Print version | History: r6 < r5 < r4 < r3 < r2 | Backlinks | Raw View | Raw edit | More topic actions...
Topic revision: r4 - 2009/12/14 - 22:15:58 - ChristopherTheissen
 
This site is powered by the TWiki collaboration platform. Copyright © by the contributing authors. All material on this collaboration platform is the property of the contributing authors.
Ideas, requests, problems regarding TWiki? Send feedback