Privacy Policy Cookie Policy Terms and Conditions Benutzer:Redf0x/dabalyze - Wikipedia

Benutzer:Redf0x/dabalyze

aus Wikipedia, der freien Enzyklopädie

Dieses Perl-Skript stammt von en:User:Bo Lindbergh/dabalyze. Es dient dazu, Links auf Begriffsklärungsseiten in der deutschen Wikipedia zu finden. Dazu wird der Datenbank-Dump von http://dumps.wikimedia.org/ im XML-Format analysiert. Es funktioniert möglicherweise nicht fehlerfrei in einer non-Unix-Umgebung (lief hier aber unter Win2k fehlerfrei). Speichere es als "dabalyze" in einem passenden Verzeichnis. Weitergehende Informationen finden sich unten.


#! /usr/bin/perl

use strict;

my %interesting=
    ('' => {
        name            => 'article',
        filename        => 'articles.txt',
        cutoff          => 1},
     'Mod魥' => {
         name           => 'vorlage',
         filename       => 'templates.txt',
         cutoff         => 0,
         list           => 1});

my $exp_re=qr/\(Begriffsklärung\)$/;

my @templates=split(/\n/,<<__EOT__);
Begriffsklärung
__EOT__

foreach my $template (@templates) {
    $template =~ s/^([[:alpha:]])/[$1\L$1]/;
}

my $tmpl_re=join('|',reverse(sort(@templates)));

my $dab_re=qr/{{(?i:msg:)?\s*(?i:Vorlage\s*:\s*)?($tmpl_re)\s*}}/;

my($ns_re,%ns_canon);

my $want_progress=@ARGV>0 && $ARGV[0] eq '-p';
my $last_progress=-1;

sub pageloop (&)
{
    my($handler)=@_;
    my($size);
    local $/="</page>\x0A";

    $size=-s PAGES;
    while (defined(my $page=<PAGES>)) {
        my($nstitle,$ns,$title);

        $page =~ /^\s*<page>/ or last;
        ($nstitle)=($page =~ m{<title>([^<]+)</title>})
            or die "Kann Titel nicht finden für Seite";
        if ($nstitle =~ /^($ns_re):(.+)$/) {
            $ns=$1;
            $title=$2;
        } else {
            $ns='';
            $title=$nstitle;
        }
        $page =~ m{</text>} or next;
        substr($page,$-[0])='';
        $page =~ /<text xml:space="preserve">/
            or die "Kann Textanfang nicht finden für Seite $nstitle";
        substr($page,0,$+[0])='';
        $handler->($nstitle,$ns,$title,$page);
        if ($want_progress) {
            my $progress=int(tell(PAGES)/$size*1000);
            if ($progress!=$last_progress) {
                $last_progress=$progress;
                printf STDERR "\r0.%.3u",$progress;
            }
        }
    }
    if ($want_progress) {
        print STDERR "\r";
    }
}

sub mungtarget ($$$ )
{
    my(undef,$source,$sub)=@_;

    for my $target ($_[0]) {
        $target =~ tr/\t\n\r/   /;
        $target =~ s/^ +//;
        $target =~ s/ +$//;
        $target =~ s/ {2,}/ /g;
        if ($sub && $target =~ m{^/}) {
            $target=$source.$target;
        } elsif ($target =~ /^:*($ns_re) *: *(.+)$/i) {
            $target=$2;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
            $target=$ns_canon{lc($1)}.":".$target;
        } elsif ($target =~ /^:*(.+)$/i) {
            $target=$1;
            utf8::decode($target);
            $target=ucfirst($target);
            utf8::encode($target);
        } else {
            # a malformed link, usually empty brackets
        }
    }
}

my(%dab,%redir,@circular);

sub pass1 ()
{
    print STDERR "Analyse: 1. Durchgang\n";
    {
        my($siteinfo,@namespaces);
        local $/="</siteinfo>\x0A";

        $siteinfo=<PAGES>;
        @namespaces=
            $siteinfo =~ m{<namespace key="-?\d+">([^<]+)</namespace>}g;
        $ns_re=join('|',map(quotemeta($_),reverse(sort(@namespaces))));
        foreach my $ns (@namespaces) {
            $ns_canon{lc($ns)}=$ns;
        }
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);

        for my $text ($_[0]) {
            my $sub=$interesting{$ns}->{subpages};

            if ($ns eq '' && $text =~ $dab_re) {
                $dab{$nstitle}=1;
            }
            if ($text =~ /^#redirect.*\[\[([^\]\|]+)/i) {
                my($target,$back);

                $target=$1;
                mungtarget($target,$nstitle,$sub);
                while ($target ne $nstitle) {
                    my($newtarget);

                    $newtarget=$redir{$target};
                    last unless defined($newtarget);
                    $target=$newtarget;
                }
                if ($target eq $nstitle) {
                    push(@circular,$nstitle);
                } else {
                    $redir{$nstitle}=$target;
                }
            }
        }
    };
    foreach my $target (keys(%redir)) {
        my(@chain);

        for (;;) {
            my $newtarget=$redir{$target};
            last unless defined($newtarget);
            push(@chain,$target);
            $target=$newtarget;
        }
        pop(@chain);
        foreach my $source (@chain) {
            $redir{$source}=$target;
        }
    }

    print STDERR "    ".keys(%dab)." Begriffsklärungsseiten\n";
    print STDERR "\n";
}

my %stats=map {
    ($_,{});
} keys(%interesting);

my %lists=map {
    ($_,{});
} grep {
    $interesting{$_}->{list};
} keys(%interesting);

sub pass2 ()
{
    my(%linked);

    print STDERR "Analyse: 2. Durchgang\n";
    {
        local $/="</siteinfo>\x0A";

        <PAGES>;
    }
    pageloop {
        my($nstitle,$ns,$title)=splice(@_,0,3);

        for my $text ($_[0]) {
            my($stats,$lists,$sub);

            $stats=$stats{$ns};
            $lists=$lists{$ns};
            $sub=$interesting{$ns}->{subpages};
            if ($stats) {
                my(%seen);

                while ($text =~ /\[\[([^\]\|]+)/g) {
                    my($target,$final);

                    $target=$1;
                    mungtarget($target,$nstitle,$sub);
                    next if $target =~ $exp_re;
                    $final=$redir{$target};
                    $final=$target unless defined($final);
                    if ($dab{$final} && !$seen{$final}++) {
                        $linked{$final}=1;
                        $stats->{$final}++;
                        if ($lists) {
                            push(@{$lists->{$final}},$nstitle);
                        }
                    }
                }
            }
        }
    };
    print STDERR "    ".keys(%linked)." Verlinkte Begriffsklärungsseiten\n";
    foreach my $ns (sort(keys(%stats))) {
        print STDERR ("    ".keys(%{$stats{$ns}})." im Artikelnamesraum ".
                      $interesting{$ns}->{name}."\n");
    }
    print STDERR "\n";
}

sub wikilink ($ )
{
    my($target)=@_;

    if (exists($redir{$target})) {
        "[{{SERVER}}{{localurl:$target|redirect=no}} $target]";
    } elsif ($target =~ m{/\.{1,2}(?:$|/)}) {
        "[{{SERVER}}{{localurl:$target}} $target]";
    } elsif ($target =~ m{^/}) {
        "[[:$target]]";
    } else {
        "[[$target]]";
    }
}

sub report ()
{
    print STDERR "Generiere Bericht\n";

    foreach my $target (@circular) {
        $redir{$target}=$target;
    }

    while (my($ns,$stats)=each(%stats)) {
        my($filename,$cutoff)=@{$interesting{$ns}}{qw(filename cutoff)};
        my $lists=$lists{$ns};
        my @nstitles=sort {
            $stats->{$b}<=>$stats->{$a} || $a cmp $b;
        } grep {
            $stats->{$_}>=$cutoff;
        } keys(%{$stats});
        my $total=0;

        open(REPORT,'>',$filename)
            or die "Anlegen fehlgeschlagen $filename: $!";
        binmode(REPORT);
        print REPORT "\xEF\xBB\xBF";
        foreach my $nstitle (@nstitles) {
            $total+=$stats->{$nstitle};
        }
        print REPORT "Gesamtzahl der Links: $total\n";
        foreach my $nstitle (@nstitles) {
            print REPORT ("# ",wikilink($nstitle),": ",$stats->{$nstitle},
                          " [[Special:Whatlinkshere/",$nstitle,"|Links]]\n");
            if ($lists) {
                foreach my $source (sort(@{$lists->{$nstitle}})) {
                    print REPORT "#* ",wikilink($source),"\n";
                }
            }
        }
        close(REPORT);
        print STDERR "    ".@nstitles." Einträge gespeichert in $filename\n";
    }

    if (@circular) {
        @circular=sort(@circular);
        open(REPORT,'>','circular.txt')
            or die "Kann circular.txt nicht anlegen: $!";
        binmode(REPORT);
        print REPORT "\xEF\xBB\xBF";
        foreach my $target (@circular) {
            print REPORT "* ",wikilink($target),"\n";
        }
        close(REPORT);
        print STDERR "    ".@circular." Einträge gespeichert in circular.txt\n";
    } else {
        unlink('circular.txt');
    }
}

open(PAGES,'<','pages-articles.xml')
    or die "Kann pages-articles.xml nicht oeffnen: $!";
binmode(PAGES);
pass1();
seek(PAGES,0,0);
pass2();
close(PAGES);
report();

Input
Das Skript erwartet die Datei pages-articles.xml von http://dumps.wikimedia.org.
Output
Das Skript generiert zwei Textdateien namens "articles.txt" und "templates.txt" im aktuellen Verzeichnis. Der erste der beiden enthält eine Liste aller Begriffsklärungsseiten, auf die aus dem Artikelnamensraum heraus verlinkt wurde. Bei Dateien sind UTF-8-kodiert; dies ist bei der Bearbeitung mit einem Texteditor zu berücksichtigen.
Das Skript kann auch zirkuläre redirects erkennen und listet diese in der Datei "circular.txt" auf.
Diagnose
Ein erfolgreicher Durchlauf des Skripts produziert Meldungen nach folgendem Muster:
Analyse: 1. Durchgang
    41868 Begriffsklärungsseiten

Analyse: 2. Durchgang
    27527 verlinkte Begriffsklärungsseiten
    27527 im Artikelnamensraum
    0 im Vorlagennamensraum

Generiere Bericht
    27527 Einträge gespeichert in articles.txt
    0 Einträge gespeichert in templates.txt
    37 Einträge gespeichert in circular.txt
THIS WEB:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2008 (no images)

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - bcl - be - be_x_old - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - co - cr - crh - cs - csb - cu - cv - cy - da - de - diq - dsb - dv - dz - ee - el - eml - en - eo - es - et - eu - ext - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gan - gd - gl - glk - gn - got - gu - gv - ha - hak - haw - he - hi - hif - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kaa - kab - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mdf - mg - mh - mi - mk - ml - mn - mo - mr - mt - mus - my - myv - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - quality - rm - rmy - rn - ro - roa_rup - roa_tara - ru - rw - sa - sah - sc - scn - sco - sd - se - sg - sh - si - simple - sk - sl - sm - sn - so - sr - srn - ss - st - stq - su - sv - sw - szl - ta - te - tet - tg - th - ti - tk - tl - tlh - tn - to - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu -

Static Wikipedia 2007:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu

Static Wikipedia 2006:

aa - ab - af - ak - als - am - an - ang - ar - arc - as - ast - av - ay - az - ba - bar - bat_smg - be - bg - bh - bi - bm - bn - bo - bpy - br - bs - bug - bxr - ca - cbk_zam - cdo - ce - ceb - ch - cho - chr - chy - closed_zh_tw - co - cr - cs - csb - cu - cv - cy - da - de - diq - dv - dz - ee - el - eml - en - eo - es - et - eu - fa - ff - fi - fiu_vro - fj - fo - fr - frp - fur - fy - ga - gd - gl - glk - gn - got - gu - gv - ha - haw - he - hi - ho - hr - hsb - ht - hu - hy - hz - ia - id - ie - ig - ii - ik - ilo - io - is - it - iu - ja - jbo - jv - ka - kg - ki - kj - kk - kl - km - kn - ko - kr - ks - ksh - ku - kv - kw - ky - la - lad - lb - lbe - lg - li - lij - lmo - ln - lo - lt - lv - map_bms - mg - mh - mi - mk - ml - mn - mo - mr - ms - mt - mus - my - mzn - na - nah - nap - nds - nds_nl - ne - new - ng - nl - nn - no - nov - nrm - nv - ny - oc - om - or - os - pa - pag - pam - pap - pdc - pi - pih - pl - pms - ps - pt - qu - rm - rmy - rn - ro - roa_rup - roa_tara - ru - ru_sib - rw - sa - sc - scn - sco - sd - se - searchcom - sg - sh - si - simple - sk - sl - sm - sn - so - sq - sr - ss - st - su - sv - sw - ta - te - test - tet - tg - th - ti - tk - tl - tlh - tn - to - tokipona - tpi - tr - ts - tt - tum - tw - ty - udm - ug - uk - ur - uz - ve - vec - vi - vls - vo - wa - war - wo - wuu - xal - xh - yi - yo - za - zea - zh - zh_classical - zh_min_nan - zh_yue - zu