Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test only files that match a regular expression #96

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@ Usage: fdupes [options] DIRECTORY...
option will change this behavior
-n --noempty exclude zero-length files from consideration
-A --nohidden exclude hidden files from consideration
-g --regex=pattern include only files matching a POSIX.2 extended
regular expression pattern. The pattern is matched
against the full file path when reaching that file,
not against the directories recursed through to reach
it. As a custom extension, start a pattern with '//'
to match the remainder of that pattern against the
filename only (last part of path)
-G --reGex=pattern case-insensitive regex; otherwise as above
-f --omitfirst omit the first file in each set of matches
-1 --sameline list each set of matches on a single line
-S --size show size of duplicate files
Expand Down
10 changes: 10 additions & 0 deletions fdupes.1
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,16 @@ omit the first file in each set of matches
.B -A --nohidden
exclude hidden files from consideration
.TP
.B -g --regex=pattern
test only files matching a POSIX.2 extended regular expression pattern.
The pattern is matched against the full path of each file as it is reached,
not against the directories that are recursed through to reach it. As a custom extension,
start a pattern with '//' to match the remainder of that pattern against the
filename only (last part of path).
.TP
.B -G --reGex=pattern
case-insensitive regex; otherwise as above
.TP
.B -1 --sameline
list each set of matches on a single line
.TP
Expand Down
91 changes: 90 additions & 1 deletion fdupes.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#include <string.h>
#include <errno.h>
#include <libgen.h>
#include <regex.h> //NOTE: _POSIX_C_SOURCE defined in /usr/include/features.h included in /usr/include/stdio.h, so we use Posix.2 regex

#include "md5/md5.h"

Expand All @@ -54,6 +55,9 @@
#define F_PERMISSIONS 0x2000
#define F_REVERSE 0x4000
#define F_IMMEDIATE 0x8000
#define F_REGEXPATH 0x00010000L
#define F_REGEXNAME 0x00020000L
#define F_REGEXICASE 0x00040000L

typedef enum {
ORDER_MTIME = 0,
Expand All @@ -65,6 +69,8 @@ char *program_name;

unsigned long flags = 0;

regex_t regexcompiled;

ordertype_t ordertype = ORDER_MTIME;

#define CHUNK_SIZE 8192
Expand Down Expand Up @@ -343,6 +349,41 @@ int grokdir(char *dir, file_t **filelistp)
free(newfile);
} else {
if (S_ISREG(linfo.st_mode) || (S_ISLNK(linfo.st_mode) && ISFLAG(flags, F_FOLLOWLINKS))) {
if (ISFLAG(flags, F_REGEXPATH)) {
int retval = regexec(&regexcompiled, newfile->d_name, 0, NULL, 0);
if (retval) { //no match or error
free(newfile->d_name);
free(newfile);
if (retval != REG_NOMATCH) {
char *errmsg;
int len = regerror(retval, &regexcompiled, NULL, 0);
errmsg = (char*)malloc(len);
regerror(retval, &regexcompiled, errmsg, len);
errormsg("Regex match failed: %s\n", errmsg);
exit(1);
}
continue;
}
}
else if (ISFLAG(flags, F_REGEXNAME)) {
fullname = strdup(newfile->d_name);
name = basename(fullname);
int retval = regexec(&regexcompiled, name, 0, NULL, 0);
free(fullname);
if (retval) { //no match or error
free(newfile->d_name);
free(newfile);
if (retval != REG_NOMATCH) {
char *errmsg;
int len = regerror(retval, &regexcompiled, NULL, 0);
errmsg = (char*)malloc(len);
regerror(retval, &regexcompiled, errmsg, len);
errormsg("Regex match failed: %s\n", errmsg);
exit(1);
}
continue;
}
}
*filelistp = newfile;
filecount++;
} else {
Expand Down Expand Up @@ -1040,6 +1081,14 @@ void help_text()
printf(" \toption will change this behavior\n");
printf(" -n --noempty \texclude zero-length files from consideration\n");
printf(" -A --nohidden \texclude hidden files from consideration\n");
printf(" -g --regex=pattern \tinclude only files matching a Posix.2 extended\n");
printf(" \tregular expressions pattern. The pattern is matched\n");
printf(" \tagainst the full file path when reaching that file,\n");
printf(" \tnot against the directories recursed through to reach\n");
printf(" \tit. As a custom extension, start a pattern with '//'\n");
printf(" \tto match the remainder of that pattern against the\n");
printf(" \tfilename only (last part of path)\n");
printf(" -G --reGex=pattern \tcase-insensitive regex, further as above\n");
printf(" -f --omitfirst \tomit the first file in each set of matches\n");
printf(" -1 --sameline \tlist each set of matches on a single line\n");
printf(" -S --size \tshow size of duplicate files\n");
Expand Down Expand Up @@ -1083,6 +1132,7 @@ int main(int argc, char **argv) {
int progress = 0;
char **oldargv;
int firstrecurse;
char *regex=NULL;

#ifndef OMIT_GETOPT_LONG
static struct option long_options[] =
Expand All @@ -1100,6 +1150,8 @@ int main(int argc, char **argv) {
{ "relink", 0, 0, 'l' },
{ "noempty", 0, 0, 'n' },
{ "nohidden", 0, 0, 'A' },
{ "regex", required_argument, NULL, 'g' },
{ "reGex", required_argument, NULL, 'G' },
{ "delete", 0, 0, 'd' },
{ "version", 0, 0, 'v' },
{ "help", 0, 0, 'h' },
Expand All @@ -1121,7 +1173,7 @@ int main(int argc, char **argv) {

oldargv = cloneargs(argc, argv);

while ((opt = GETOPT(argc, argv, "frRq1SsHlnAdvhNImpo:i"
while ((opt = GETOPT(argc, argv, "frRq1SsHlnAg:G:dvhNImpo:i"
#ifndef OMIT_GETOPT_LONG
, long_options, NULL
#endif
Expand Down Expand Up @@ -1157,6 +1209,18 @@ int main(int argc, char **argv) {
case 'A':
SETFLAG(flags, F_EXCLUDEHIDDEN);
break;
case 'G':
SETFLAG(flags, F_REGEXICASE);
//NO break;
case 'g':
if (strlen(optarg) >= 2 && optarg[0] == '/' && optarg[1] == '/') {
regex = optarg + 2;
SETFLAG(flags, F_REGEXNAME);
} else {
regex = optarg;
SETFLAG(flags, F_REGEXPATH);
}
break;
case 'd':
SETFLAG(flags, F_DELETEFILES);
break;
Expand Down Expand Up @@ -1205,6 +1269,27 @@ int main(int argc, char **argv) {
exit(1);
}

if (ISFLAG(flags, F_REGEXNAME) || ISFLAG(flags, F_REGEXPATH)) {
if (regex == NULL) {
errormsg("No regular expression parsed\n");
exit(1);
}
if (strlen(regex) == 0) {
errormsg("Empty regular expression parsed\n");
exit(1);
}
x = regcomp(&regexcompiled, regex, REG_EXTENDED | REG_NOSUB | (ISFLAG(flags, F_REGEXICASE) ? REG_ICASE : 0));
if (x) {
char *errmsg;
int len = regerror(x, &regexcompiled, NULL, 0);
errmsg = (char*)malloc(len);
regerror(x, &regexcompiled, errmsg, len);
errormsg("Could not compile regex %s : %s\n", regex, errmsg);
free(errmsg);
exit(1);
}
}

if (ISFLAG(flags, F_RECURSE) && ISFLAG(flags, F_RECURSEAFTER)) {
errormsg("options --recurse and --recurse: are not compatible\n");
exit(1);
Expand Down Expand Up @@ -1244,6 +1329,10 @@ int main(int argc, char **argv) {
if (!ISFLAG(flags, F_HIDEPROGRESS)) fprintf(stderr, "\r%40s\r", " ");
exit(0);
}

if (ISFLAG(flags, F_REGEXNAME) || ISFLAG(flags, F_REGEXPATH)) {
regfree(&regexcompiled);
}

curfile = files;

Expand Down