FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
get-projects.php
1 #!/usr/bin/php
2 <?php
3 /***********************************************************
4  get-projects.php
5  Copyright (C) 2007 Hewlett-Packard Development Company, L.P.
6 
7  This program is free software; you can redistribute it and/or
8  modify it under the terms of the GNU General Public License
9  version 2 as published by the Free Software Foundation.
10 
11  This program is distributed in the hope that it will be useful,
12  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  GNU General Public License for more details.
15 
16  You should have received a copy of the GNU General Public License along
17  with this program; if not, write to the Free Software Foundation, Inc.,
18  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19  ***********************************************************/
20 
54 /*
55  * Defects:
56  * 1. If no parameters are passed in, the script behaves unpredictably;
57  * need to check for that case.
58  */
59 // pathinclude below is dependent on having fossology installed.
60 require_once "FIXMETOBERELATIVE/pathinclude.php"; // brings in global $PROJECTSTATEDIR +
61 global $LIBDIR;
62 global $INCLUDEDIR;
63 require_once("$LIBDIR/lib_projxml.h.php");
64 require_once("$INCLUDEDIR/fm-paths.php");
65 
// Usage text shown for -h and alongside every command line error.
$usage = <<< USAGE
Usage: get-projects [-h] -f <file>
 Where <file> is an uncompressed XML file, fully qualified
 -h displays this usage.

USAGE;

$XML_input_file = NULL;

// Walk the command line. Errors are reported on stderr with a non-zero exit
// status so that calling scripts can detect the failure (die() with a string
// prints to stdout and exits with status 0).
for ($i = 1; $i < $argc; $i++) {
    switch ($argv[$i]) {
        case '-f':
            // the file name is the next argument
            $i++;
            if (!isset($argv[$i]) || $argv[$i] === '') {
                fwrite(STDERR, "ERROR: Must specify an uncompressed filename after -f\n$usage");
                exit(1);
            }
            $XML_input_file = $argv[$i];
            break;
        case '-h':
            echo $usage;
            exit(0);
        default:
            fwrite(STDERR, "ERROR: Unknown argument: {$argv[$i]}\n$usage");
            exit(1);
    }
}

// Refuse to go any further without an input file: checking here keeps the
// script from creating output directories for a run that cannot succeed.
if (is_null($XML_input_file)) {
    fwrite(STDERR, "Error: null input file\n$usage");
    exit(1);
}
95 
// convention is to put the trailing / at the end of the dir so everyone else
// doesn't have to worry about it.
// FIX THIS: need to have env file created by install process.

// set the destination directory, use /tmp if none supplied
if (empty($FMDIR)) {
    $dest_dir = '/tmp/';
} else {
    $dest_dir = $FMDIR; // from fm-paths.php in /usr/local/include
}

// create output directory with the date as part of the name.
$yyyymmdd = date('Y-m-d');
$golden = '/golden.' . $yyyymmdd . '/';
// strip any trailing slash from the base first so the join does not yield '//'
$dest_dir = rtrim($dest_dir, '/') . $golden;
$wget_logs = $dest_dir . 'wget-logs/';
$log_data = $dest_dir . 'Logs-Data/';
$input_files = $dest_dir . 'Input-files/';

// Create the output directories (parents included) if they do not exist yet.
// mkdir() is used instead of shelling out to "mkdir -p" so that directory
// names containing spaces or shell metacharacters are handled correctly.
foreach (array($dest_dir, $wget_logs, $log_data, $input_files) as $out_dir) {
    // the second is_dir() covers the directory appearing between check and create
    if (!is_dir($out_dir) && !mkdir($out_dir, 0777, true) && !is_dir($out_dir)) {
        echo "ERROR: can't create output directory: $out_dir\n";
        exit(1);
    }
}
147 
// make sure we have some sort of valid input (e.g. gp -f)
if (is_null($XML_input_file)) {
    echo "Error: null input file\n";
    echo $usage;
    exit(1);
}

// simplexml.... can't deal with a compressed file, make sure it's not.
// possible enhancement is to uncompress the file if passed one....
// Note that the code below still may not catch all of them due to no
// standard naming convention.
$last = strrchr($XML_input_file, '.');   // false when the name has no '.'
$compressed_suffixes = array('.gz', '.bz2', '.zip');
if ($last !== false && in_array(strtolower($last), $compressed_suffixes, true)) {
    // say why the file is rejected instead of printing the bare usage text
    echo "Error: $XML_input_file looks like a compressed file ($last)\n";
    echo $usage;
    exit(1);
}

echo "Processing Xml file $XML_input_file\n";
176 
// parse the xml file and build the data structure. read_pfile returns
// the data structure sorted (ascending).
$fm_projects = read_pfile($XML_input_file);
if (!is_array($fm_projects)) {
    // nothing usable came back; carry on with an empty list so the run
    // still produces its (empty) report.
    echo "Warning! no projects could be read from $XML_input_file\n";
    $fm_projects = array();
}

// Look for projects without any of the 3 archives. Log any found into
// the skipped_fmprojects file and remove them from the fm_projects array.
//
// The log is opened once, on the first skipped project, and kept open for
// the rest of the loop: re-opening it with mode 'w' for every project would
// truncate it each time and only the last skipped project would be recorded.

$projects_skipped = 0;
$NoUrls = NULL;
$skipped_log = "{$log_data}skipped_fmprojects";

foreach ($fm_projects as $rank => $key) {
    // reset so an empty entry cannot reuse the previous project's values
    $name = $homepage = $release_version = '';
    $url_tgz = $url_bz2 = $url_zip = '';

    foreach ($key as $name => $values) {
        list(
            $url_tgz,
            $url_bz2,
            $url_zip,
            $homepage,
            $short_desc,
            $release_version,
            $release_version_id,
            $release_version_date
        ) = $values;
    }

    if (($url_tgz != "") or ($url_bz2 != "") or ($url_zip != "")) {
        continue;   // at least one archive url, keep the project
    }

    if (is_null($NoUrls)) {
        $NoUrls = fopen($skipped_log, 'w');
        if ($NoUrls === false) {
            $err = error_get_last();
            echo "Can't open: " . ($err ? $err['message'] : $skipped_log) . "\n";
            exit(1);
        }
    }
    // fwrite() signals failure with false, never with -1
    if (fwrite($NoUrls, "$rank $name $homepage $release_version\n") === false) {
        $err = error_get_last();
        echo "Can't write: " . ($err ? $err['message'] : $skipped_log) . "\n";
        exit(1);
    }
    $projects_skipped++;
    unset($fm_projects[$rank]);
}

if (!is_null($NoUrls)) {
    fclose($NoUrls);
}
213 
/*
 * At this point the array should only have the
 * (fm_projects - skipped projects). Every project on the working list
 * has AT LEAST 1 archive. Go get it.
 * wget_url is called synchronously since we only get 1 package and
 * we need to know what status wget returned and what it got us.
 */

$skipped_uploads = array();
$uploads = array();
$mode = 's';
$uploads_scheduled = 0;

// archive type passed to wget_url for each kind of url
$gzip = '.gz';
$bzip2 = '.bz2';
$zip1 = '.zip';

foreach ($fm_projects as $pkg_rank => $nkey) {
    foreach ($nkey as $pkg_name => $pkg_data) {
        // unpack the data so the code is easier to read
        list(
            $tgz_url,
            $bz2_url,
            $zip_url,
            $homepg,
            $short_desc,
            $ver,
            $ver_id,
            $ver_date
        ) = $pkg_data;

        // Select the archives in the following order: .gz, .bz2, .zip
        // There should be at least one of them.
        echo "Trying project #$pkg_rank $pkg_name at:\n";
        if ($tgz_url != "") {
            $pkg_url = $tgz_url;
            $pkg_type = $gzip;
        } elseif ($bz2_url != "") {
            $pkg_url = $bz2_url;
            $pkg_type = $bzip2;
        } elseif ($zip_url != "") {
            $pkg_url = $zip_url;
            $pkg_type = $zip1;
        } else {
            // should not happen after the filtering above; never fall
            // through with the previous package's download result.
            echo "Warning! no archive url found for $pkg_name, skipping\n";
            echo "\n-----\n";
            continue;
        }

        // Package the url plus the common data needed by wget_url
        $common_data = array($pkg_url, $short_desc, $ver, $ver_id, $ver_date);
        $tupload = wget_url($pkg_rank, $pkg_name, $pkg_type, $common_data, $mode);
        if (!is_array($tupload)) {
            // normalise so the checks below never index a non-array
            $tupload = array('Compressed' => NULL, 'Null' => NULL, 'Uncompressed' => NULL);
        }

        if (is_null($tupload['Null'])) {
            echo "Warning! There may have been an undetected error in the wget of $pkg_name\n";
            echo "Check the wget logs in $wget_logs\n";
        }
        if (!is_null($tupload['Compressed'])) {
            $uploads[] = $tupload['Compressed'];
            $uploads_scheduled++;
            echo "#$pkg_rank $pkg_name was downloaded and can be scheduled for an upload\n";
        } elseif (!is_null($tupload['Uncompressed'])) {
            echo "WARNING! did not get a compressed archive from wget\n";
            echo "Will Not upload $pkg_name\n";
            $skipped_uploads[] = $tupload['Uncompressed'];
        }
        echo "\n-----\n"; // eye-candy, separates packages in the output
    }
}
287 
// save the skipped uploads in a file (if any)

$skipped_up = count($skipped_uploads);
if ($skipped_up != 0) {
    // $log_data already ends with a '/', so no extra separator is needed
    $skipped_file = "{$log_data}skipped_uploads";

    echo "Saving skipped uploads (downloaded files that were not compressed)\n";
    echo "There were $skipped_up skipped uploads, see $skipped_file for details\n";

    $SUP = fopen($skipped_file, 'w');
    if ($SUP === false) {
        $err = error_get_last();
        die("Can't open $skipped_file, " . ($err ? $err['message'] : 'unknown error') . "\n");
    }
    foreach ($skipped_uploads as $skipped) {
        if (fwrite($SUP, "$skipped\n") === false) {
            $err = error_get_last();
            die("Can't write to $skipped_file, " . ($err ? $err['message'] : 'unknown error') . "\n");
        }
    }
    fclose($SUP);
}

// at this point we have done the wgets and made a list of all the ones
// that succeeded. Now process that list into an input file for cp2foss
// as cp2foss will do the actual upload.

create_cp2foss_ifile($uploads, "{$input_files}Freshmeat_to_Upload");

/* Report results */
report($log_data);

// end of Main....
315 
/**
 * Write the cp2foss input file, one line per downloaded archive.
 *
 * Each entry of $uploads is handed to parse_fm_input() and the result is
 * unpacked as (rank, name, archive path, description, version, version id,
 * version date). Entries without an archive path are left out.
 *
 * @param array  $uploads  list of upload records as produced by wget_url()
 * @param string $filename path of the file to create (it is overwritten)
 *
 * @return void the script is terminated with die() if the file cannot be
 *              opened or written.
 */
function create_cp2foss_ifile($uploads, $filename)
{
    $UPLOAD = fopen($filename, 'w');
    if ($UPLOAD === false) {
        $err = error_get_last();
        die("ERROR: can't open $filename, " . ($err ? $err['message'] : 'unknown error') . "\n");
    }

    foreach ($uploads as $upload) {
        $parms = parse_fm_input($upload);

        list(
            $rank,
            $name,
            $archive_path,
            $description,
            $version,
            $version_id,
            $version_date
        ) = $parms;

        // don't write an entry that has no archive path (wget either returned
        // an error or a file that was not a compressed archive).
        if (!isset($archive_path) || $archive_path === '') {
            continue;
        }

        $folder_path = '-p Freshmeat';
        $alpha = '-A';
        $name = "-n '$name-$version'";
        // For now we are going to put the -A at the end to work around a defect in cp2foss.
        // NOTE(review): a name or description containing a single quote breaks
        // the quoting below; how cp2foss parses this file is not visible here,
        // so the format is left unchanged.
        $cp2foss_input = "$folder_path $name -a $archive_path -d '$description' $alpha\n";

        if (fwrite($UPLOAD, $cp2foss_input) === false) {
            $err = error_get_last();
            die("Errors: can't write " . ($err ? $err['message'] : $filename) . "\n");
        }
    }
    fclose($UPLOAD);
    return;
}
/**
 * Print the end-of-run summary on standard output.
 *
 * Reads the globals $projects_skipped, $uploads_scheduled and $input_files
 * that the main script sets.
 *
 * @param string $output_dir directory holding the log data, with a trailing
 *                           slash (the skipped_fmprojects name is appended
 *                           directly to it)
 *
 * @return void
 */
function report($output_dir)
{
    global $projects_skipped, $uploads_scheduled, $input_files;

    $skipped_path = "{$output_dir}skipped_fmprojects";

    // the "downloaded" wording for the zero case doesn't make sense, fix later...
    $upload_format = $uploads_scheduled
        ? "There were %d projects scheduled for uploading\nSee the {$input_files}Freshmeat_to_Upload\nfile for details\n\n"
        : "There were %d projects downloaded\nSee the $output_dir for details\n\n";
    echo sprintf($upload_format, $uploads_scheduled);

    if ($projects_skipped == 0) {
        echo sprintf("There were %d skipped projects for this run\n", $projects_skipped);
        echo "Skipped projects are projects that had no compressed downloadable archives\n";
    } else {
        echo sprintf(
            "There were %d skipped projects for this run\nSee the {$skipped_path} file for details\n",
            $projects_skipped
        );
    }

    echo "To upload the files into the data-base run cp2foss using the Freshmeat_to_Upload file\n";
    return;
}
404 
405 
406 
/**
 * Download one project archive with wget and classify what was fetched.
 *
 * NOTE: quite a few of the urls that are supposed to point to an archive
 * really end up just depositing a file in various forms:
 * *.html, *.cgi showfiles.php?xxxxxx, etc....
 *
 * Uses the globals $wget_logs, $log_data and $dest_dir set by the main
 * script; wget's log is written to "{$wget_logs}log.<name>-<rank>".
 *
 * @param mixed  $project_rank rank of the project, used in log and result
 * @param string $project_name name of the project
 * @param string $ark_type     archive suffix chosen by the caller (not used here)
 * @param array  $proj_data    (url, short description, version, version id,
 *                             version date)
 * @param string $mode         's' runs wget and waits for it, 'a' starts wget
 *                             with a trailing '&' and does not examine the result
 *
 * @return array keys 'Compressed', 'Uncompressed' and 'Null'. All three are
 *               NULL when nothing usable was downloaded (or the mode is not
 *               's'). Otherwise 'Null' is True and exactly one of
 *               'Compressed' / 'Uncompressed' holds the quoted record string.
 */
function wget_url($project_rank, $project_name, $ark_type, $proj_data, $mode)
{
    global $wget_logs;
    global $log_data;
    global $dest_dir;

    list(
        $url,
        $short_desc,
        $ver,
        $ver_id,
        $ver_date
    ) = $proj_data;

    // set these to null, so the caller knows which one got set. Done up
    // front so every return path hands back the same shape.
    $upload = array(
        'Compressed'   => NULL,
        'Null'         => NULL,
        'Uncompressed' => NULL,
    );

    // a '/' in the project name would point the log into a directory that
    // does not exist, making wget fail before it downloads anything
    $log_name = str_replace('/', '_', "log.$project_name-$project_rank");
    $log_path = $wget_logs . $log_name;

    // Every argument is escaped: urls routinely contain '&', '?' or ';',
    // which the shell would otherwise interpret.
    // NOTE(review): the original prefixed the command with "$proxy", which was
    // never in scope inside this function (always empty), so it is dropped.
    $wCmd = 'wget -P ' . escapeshellarg($dest_dir)
          . ' -o ' . escapeshellarg($log_path)
          . ' ' . escapeshellarg($url);

    if ($mode == 'a') {
        echo "$url\n";
        system($wCmd . ' &', $retval);
        return $upload;
    }

    if ($mode != 's') {
        return $upload;
    }

    echo "$url\n";
    exec($wCmd, $dummy, $retval);

    if ($retval != 0) {
        // record the failed download and report "nothing downloaded"
        $failed_log = "{$log_data}failed-wgets";
        $WGF = fopen($failed_log, 'a');
        if ($WGF === false) {
            $err = error_get_last();
            die("Can't open: " . ($err ? $err['message'] : $failed_log) . "\n");
        }
        // fwrite() signals failure with false, never with -1
        if (fwrite($WGF, "$project_rank $project_name $url\n") === false) {
            $err = error_get_last();
            die("Can't write: " . ($err ? $err['message'] : $failed_log));
        }
        fclose($WGF);
        return $upload;
    }

    // wget can return a 0 (zero) exit status with 404 type errors, see
    // _getfmpath below. So we check here if $archive_path is null
    // if null, it's a failed wget, return null to indicate that.
    $archive_path = _getfmpath($log_path);
    if (is_null($archive_path)) {
        echo "Warning! returning NULL for an archive path\n";
        return $upload;
    }

    // wget appears to have worked, now what type of file got downloaded?
    // For now we will only process compressed archives, the rest of
    // the files are usually a download of their front page, which
    // is useless to upload.
    // NOTE(review): file(1) usually describes zip files as "Zip archive data",
    // which does not contain 'compressed data' - confirm whether zip
    // downloads are meant to be classified as uncompressed.
    $type = (string) exec('file -b ' . escapeshellarg($archive_path), $dummy, $ret_val);
    if (strpos($type, 'compressed data') !== false) {
        $upload['Compressed'] =
            "'$project_rank' '$project_name' '$archive_path' '$short_desc' '$ver' '$ver_id' '$ver_date'";
    } else {
        $upload['Uncompressed'] = "'$project_name' '$archive_path'";
    }
    $upload['Null'] = True;

    return $upload;
}
/**
 * Extract the path of the downloaded file from a wget log.
 *
 * The Freshmeat rdf uses a fake url and archive name so we need to get
 * the path name of the downloaded archive by looking in the wget
 * log file.
 *
 * The status line is the second to last line of the log, or the last line
 * when the second to last starts with "Removed ". The path is taken from
 * the fifth space-separated field of that line, with the surrounding
 * ` and ' quotes removed.
 *
 * @param string $path path of the wget log file
 *
 * @return string|null the path of the downloaded file, or NULL when the log
 *                     cannot be read, reports an error, or has no path field.
 */
function _getfmpath($path)
{
    $path_wanted = NULL;

    if (!is_readable($path)) {
        echo "ERROR: can't read wget log file $path\n";
        return $path_wanted;
    }
    $contents = file($path);
    $size = ($contents === false) ? 0 : count($contents);
    if ($size < 2) {
        // too short to hold a status line
        return $path_wanted;
    }

    $stat_line = $contents[$size - 2];
    if (strncmp($stat_line, 'Removed ', 8) === 0) {
        // adjust for a different case, if wget downloads a .listing file
        // it adjusts it be an index.html file instead.
        $stat_line = $contents[$size - 1];
    }

    // We shouldn't find errors like this in the file, wget is supposed to
    // have returned with 0 status. Each marker is a plain substring of the
    // status line, mapped to the message reported for it.
    $failures = array(
        'ERROR 404:'             => 'ERROR 404 found in file',
        'ERROR 502:'             => 'ERROR 502 found in file',
        'ERROR 503:'             => 'ERROR 503 found in file',
        'ERROR 400:'             => 'ERROR 400 found in file',
        '--no-check-certificate' => 'ERROR Secure connect to sourceforge.net needed: in file',
    );
    foreach ($failures as $marker => $message) {
        if (strpos($stat_line, $marker) !== false) {
            echo "$message $path\n";
            echo "Line was:\n$stat_line\n";
            return $path_wanted;
        }
    }

    $chunks = explode(' ', $stat_line);
    if (!isset($chunks[4])) {
        // not a "... - `path' saved [...]" line
        return $path_wanted;
    }
    // Strip the ` off the front and the ' (plus any line ending) off the end
    $stripped = rtrim(ltrim($chunks[4], '`'), "'\r\n");
    if ($stripped !== '') {
        $path_wanted = $stripped;
    }

    return $path_wanted;
}
567 
568 ?>
#define ERROR(...)
Definition: logging.h:90
if(!preg_match("/\s$projectGroup\s/", $groups)&&(posix_getgid()!=$gInfo['gid']))
get monk license list of one specified uploadtree_id
Definition: migratetest.php:44
Usage()
Print Usage statement.
Definition: fo_dbcheck.php:75
list_t type structure used to keep various lists. (e.g. there are multiple lists).
Definition: nomos.h:321
FUNCTION void usage(char *name)
Definition: usage.c:30