Gauche で CSV 形式のデータの重複するカラムのあるレコードを削除する

CSV ファイルの中で、指定したカラムの値が他のレコードと重複している場合に、最初の1件だけを残して残りのレコードを削除する Gauche スクリプト。
以下のコードでは *check-column-index* を 1(0 始まりのインデックス)に定義して、2番目のカラムでの重複をチェックしている。

(use text.csv)
(use gauche.collection)
(use srfi-13)

;; Zero-based index of the column whose values are compared when looking
;; for duplicate records (1 selects the second column of each record).
(define *check-column-index* 1)

;; Sort LIS with a partition-based quicksort and return a new list.
;;   comp : two-argument predicate; (comp x pivot) true means X is placed
;;          before PIVOT in the result.
;;   lis  : list to sort (not modified).
;; The first element is used as the pivot. Elements for which COMP is
;; false stay after the pivot in their original relative order.
(define (quick-sort comp lis)
  (cond ((null? lis) '())
        (else
         (let ((pivot (car lis))
               (others (cdr lis)))
           (receive (before after)
               (partition (lambda (item) (comp item pivot)) others)
             (append (quick-sort comp before)
                     (list pivot)
                     (quick-sort comp after)))))))

;; Sort a list of CSV records by the field found at COLUMN-INDEX.
;;   comp         : two-argument predicate applied to the two field values.
;;   lis          : list of records, each record being a list of fields.
;;   column-index : zero-based position of the field used as the sort key.
;; Returns a new sorted list produced by quick-sort.
(define (quick-sort-csv comp lis column-index)
  ;; Extract the sort key from one record.
  (define (key-of record)
    (list-ref record column-index))
  ;; Compare two records by key, coercing the result to a strict boolean.
  (define (record-before? left right)
    (and (comp (key-of left) (key-of right)) #t))
  (quick-sort record-before? lis))

;; Drop every record whose checked column equals that of the record
;; immediately before it, keeping the first record of each run.
;;   record-list : list of records (lists of strings); records with equal
;;                 values in column *check-column-index* must be adjacent,
;;                 e.g. because the list was sorted on that column.
;; Returns the surviving records in REVERSE order of RECORD-LIST, because
;; results are accumulated with cons.
(define (delete-repeated-record record-list)
  (define (iter record-list prev-record-item result-list)
    (if (null? record-list)
        result-list
        (let* ((record (car record-list))
               (current-record-item (list-ref record *check-column-index*)))
          (iter (cdr record-list)
                current-record-item
                ;; prev-record-item is #f only before the first record, so a
                ;; first record whose column is "" is not mistaken for a
                ;; duplicate.
                (if (and prev-record-item
                         (string=? current-record-item prev-record-item))
                    result-list
                    (cons record result-list))))))
  (iter record-list #f '()))

;; Write RECORD-LIST to the current output port as CSV, one record per line.
;;   record-list : list of records, each record being a list of strings.
;; Every field is wrapped in double quotes, and a double quote inside a
;; field is written as two double quotes so the output remains valid CSV.
;; Returns '().
(define (print-csv record-list)
  ;; Escape embedded double quotes by doubling them.
  (define (escape-field field)
    (regexp-replace-all #/"/ field "\"\""))
  (for-each (lambda (record)
              (print "\"" (string-join (map escape-field record) "\",\"") "\""))
            record-list)
  '())

;; Read the comma-separated file named FILE and return its contents as a
;; list of records, each record being the value produced by the CSV reader
;; for one row.
(define (read-csv file)
  (define read-record (make-csv-reader #\,))
  (call-with-input-file file
    (cut port->list read-record <>)))

;; Script entry point.
;;   args : command-line argument list; the first element is the script
;;          name and the second is the path of the CSV file to process.
;; Sorts the records on column *check-column-index*, removes records whose
;; value in that column repeats, and prints the rest as CSV.
;; Raises an error when no CSV file argument is given. Returns 0.
(define (main args)
  ;; Test the tail of ARGS rather than (cadr args): taking cadr of a
  ;; one-element list is itself an error, which hid the intended message.
  (if (or (null? args) (null? (cdr args)))
      (error "CSV file is required")
      (let ((sorted-csv-list (quick-sort-csv string>
                                             (read-csv (cadr args))
                                             *check-column-index*)))
        (print-csv (delete-repeated-record sorted-csv-list))))
  0)

実行結果

$ cat test.csv
"上杉謙信","ken@foo.com","09023421011"
"織田信長","oda@hoge.com","08034528761"
"徳川家康","toku@yahoo.com","0230989124"
"上杉輝虎","ken@foo.com","09023421011"
"豊臣秀吉","saru@gmail.com","08013457812"
"長尾景虎","ken@foo.com","09023421011"
"松平元康","toku@yahoo.com","0230989124"
"木下藤吉郎","saru@gmail.com","08013457812"
"毛利元就","nari@yahoo.com","02056720981"
$ ./delete-repeated-csv-record.scm test.csv
"上杉謙信","ken@foo.com","09023421011"
"毛利元就","nari@yahoo.com","02056720981"
"織田信長","oda@hoge.com","08034528761"
"豊臣秀吉","saru@gmail.com","08013457812"
"徳川家康","toku@yahoo.com","0230989124"
プログラミングGauche
プログラミングGauche

posted with amazlet at 09.07.23
Kahuaプロジェクト
オライリージャパン
売り上げランキング: 51330
«
»