Showing posts with label Storage. Show all posts

Thursday, March 13, 2014

Setup SAN Boot for RHEL 6.x using native multipath on EMC storage

Requirements:
1) RHEL 6.x (most apply to RHEL 5.x too, RHEL 5.x use mkinitrd instead of Dracut and the /etc/multipath.conf is slightly different refer to Red Hat KB in reference section)
2) EMC storage was setup with Active/Active (ALUA)
3) Boot LUN was presented with single path for initial install


Procedures:

1. Server boots up after initial install
2. Login to server as root to enable multipath
[root@server1]#mpathconf --enable --with_multipathd y
3. Edit /etc/multipath.conf and make sure it only contains following valid parameters

blacklist {
}


defaults {
 user_friendly_names yes
}
devices {
  device {
    vendor "DGC"
    product ".*"
    product_blacklist "LUNZ"
    hardware_handler "1 alua"   
    path_checker directio    
    prio alua                
  }
}
4. Find out the logical path the root disk is mapped to
[root@server1]#multipath -v3
It should be /dev/mapper/mpatha

5. Create initramfs with multipath module
[root@server1]#dracut --force --add multipath

6. Make sure multipath.conf is included in initrd image
[root@server1]#lsinitrd /boot/initramfs-*.x86_64.img | grep multipath.conf
-rw-r--r--   1 root     root         2525 Feb 27 13:31 etc/multipath.conf
7. Modify the /boot/grub/device.map and change
 (hd0) /dev/sda    to
 (hd0) /dev/mapper/mpatha
This is assuming the boot disk is on /dev/mapper/mpatha as verified in step 4 above.

8. Reboot the server.

9. Verify multipath, check hwhandler='1 alua' and member disk sda for mpatha
[root@server1]#multipath -ll 
mpatha (3600601609973310067eb1e1ed69ae311) dm-0 DGC,VRAID
size=150G features='1 queue_if_no_path' hwhandler='1 alua' wp=rw
`-+- policy='round-robin 0' prio=10 status=enabled
  |- 1:0:0:0 sda 8:0   active ready running
  

10. Ask storage administrator to enable other paths for boot LUN.
11. Reboot server again after multipath is enabled in storage too
12. Login server to verify all paths, check hwhandler='1 alua' prio>0
if hwhandler='1 emc' or prio=0 means PNR mode

[root@server1]#multipath -ll
mpatha (3600601609973310067eb1e1ed69ae311) dm-0 DGC,VRAID
size=150G features='1 queue_if_no_path' hwhandler='1 alua' wp=rw
|-+- policy='round-robin 0' prio=130 status=active
| |- 1:0:1:0 sdd 8:48  active ready running
| `- 2:0:1:0 sdj 8:144 active ready running
`-+- policy='round-robin 0' prio=10 status=enabled
  |- 1:0:0:0 sda 8:0   active ready running
  `- 2:0:0:0 sdg 8:96  active ready running
mpathb (360060160997331009fd6e124d69ae311) dm-1 DGC,VRAID
size=800G features='1 queue_if_no_path' hwhandler='1 alua' wp=rw
|-+- policy='round-robin 0' prio=130 status=active
| |- 1:0:0:1 sdb 8:16  active ready running
| `- 2:0:0:1 sdh 8:112 active ready running
`-+- policy='round-robin 0' prio=10 status=enabled
  |- 1:0:1:1 sde 8:64  active ready running
  `- 2:0:1:1 sdk 8:160 active ready running
13. Partition other LUNs using fdisk command as normal, but use the logical path /dev/mapper/mpathb etc (the partition will be created as /dev/mapper/mpathbp1 instead of /dev/mapper/mpathb1)
NOTE: any change to /etc/multipath.conf requires re-creating the initramfs ( dracut --force --add multipath) and a reboot, because the boot LUN is on SAN; if the boot LUN is a local disk, a change to /etc/multipath.conf only requires a multipathd restart

Sunday, March 24, 2013

Configure multipath on Solaris 11 for IBM V7000 SAN storage

IBM V7000 is not listed from command “mpathadm show mpath-support libmpscsi_vhci.so”, but it is still supported, Solaris 11 mpxio supports any third party storage device that is T10/T11 standards-compliant.
Procedures to setup multipath
#Setup zoning in SAN switch
#Login to V7000 management UI to map Solaris host to the volume, select host type ‘TPGS’
#Rescan new SAN disks without rebooting
$cfgadm -o force_update -c configure cX (X is the port id as shown cfgadm -al)
#verify SAN disks are detected.
$echo|format
#create scsi_vhci.conf
#scsi_vhci.conf doesn’t need to be customized, the scsi-vhci-failover-override parameter is optional, IBM V7000 is detected as f_tpgs with the standard probe.
$cp /kernel/drv/scsi_vhci.conf /etc/driver/drv/scsi_vhci.conf

#Obtain the device path of the fc ports of a single HBA
$ls -l /dev/cfg
lrwxrwxrwx   1 root     root          60 Feb 28 10:20 c4 -> ../../devices/pci@400/pci@2/pci@0/pci@8/SUNW,qlc@0/fp@0,0:fc
lrwxrwxrwx   1 root     root          62 Feb 28 10:20 c5 -> ../../devices/pci@400/pci@2/pci@0/pci@8/SUNW,qlc@0,1/fp@0,0:fc

the path needed is the string between ./devices/ and fp@, so the paths are  
/pci@400/pci@2/pci@0/pci@8/SUNW,qlc@0
/pci@400/pci@2/pci@0/pci@8/SUNW,qlc@0,1

#Create fp.conf
cp /kernel/drv/fp.conf /etc/driver/drv/fp.conf
#edit fp.conf to enable multipath for the two fc ports only
mpxio-disable="yes";
name="fp" parent="/pci@400/pci@2/pci@0/pci@8/SUNW,qlc@0" port=0 mpxio-disable="no";
name="fp" parent="/pci@400/pci@2/pci@0/pci@8/SUNW,qlc@0,1" port=0 mpxio-disable="no";
#run the command to enable multiple path on fc ports only, server will need to be rebooted.
$stmsboot -u -D fp

#verify 
$ echo| format will show only a single disk
$stmsboot -L
non-STMS device name                    STMS device name
------------------------------------------------------------------
/dev/rdsk/c4t50050768024046D8d0 /dev/rdsk/c0t6005076802830163A000000000000005d0
/dev/rdsk/c4t50050768022046D9d0 /dev/rdsk/c0t6005076802830163A000000000000005d0
/dev/rdsk/c5t50050768022046D8d0 /dev/rdsk/c0t6005076802830163A000000000000005d0
/dev/rdsk/c5t50050768024046D9d0 /dev/rdsk/c0t6005076802830163A000000000000005d0
$mpathadm list lu
        /dev/rdsk/c0t5000CCA0164251B8d0s2
                Total Path Count: 1
                Operational Path Count: 1
        /dev/rdsk/c0t5000CCA01642CDDCd0s2
                Total Path Count: 1
                Operational Path Count: 1
        /dev/rdsk/c0t6005076802830163A000000000000005d0s2
                Total Path Count: 4
                Operational Path Count: 4
$mpathadm show lu /dev/rdsk/c0t6005076802830163A000000000000005d0s2

#the disk is detected as f_tpgs as shown in messages log
$grep f_ /var/adm/messages
Mar 21 13:56:46 dnmsovm1 scsi: [ID 583861 kern.info] ssd4 at scsi_vhci0: unit-address g6005076802830163a000000000000005: f_tpgs


Wednesday, September 19, 2012

Detect increased new size of existing LUN in RHEL by rescanning FC port without reboot

If the usual command :

echo "- - -" > /sys/class/scsi_host/hostX/scan

doesn’t work for FC target, you can try this:

echo 1 > /sys/devices/{PCI-DEVICE-ID}/rescan

The path is the device path to the FC target, which may have multiple paths. The scsi-rescan (rescan-scsi-bus.sh) tool in sg3_utils worked great for a new LUN but it couldn’t detect the new size of an existing LUN either. The following procedure was tested in RHEL 6.3.



#Find the PCI ID of your device
>lspci | grep -i qlogic
15:00.0 Fibre Channel: QLogic Corp. ISP2532-based 8Gb Fibre Channel to PCI Express HBA (rev 02)
1a:00.0 Fibre Channel: QLogic Corp. ISP2532-based 8Gb Fibre Channel to PCI Express HBA (rev 02)
#Find the file rescan in /sys/devices by filtering PCI ids found above
>find /sys/devices  -name rescan  | egrep '15:00|1a:00'
/sys/devices/pci0000:00/0000:00:03.0/0000:15:00.0/rescan
/sys/devices/pci0000:00/0000:00:03.0/0000:15:00.0/host1/rport-1:0-0/target1:0:0/1:0:0:0/rescan
/sys/devices/pci0000:00/0000:00:03.0/0000:15:00.0/host1/rport-1:0-1/target1:0:1/1:0:1:0/rescan
/sys/devices/pci0000:00/0000:00:07.0/0000:1a:00.0/rescan
/sys/devices/pci0000:00/0000:00:07.0/0000:1a:00.0/host2/rport-2:0-0/target2:0:0/2:0:0:0/rescan
/sys/devices/pci0000:00/0000:00:07.0/0000:1a:00.0/host2/rport-2:0-1/target2:0:1/2:0:1:0/rescan
#kick off rescan by updating rescan file in each port
echo 1 > /sys/devices/pci0000:00/0000:00:03.0/0000:15:00.0/host1/rport-1:0-0/target1:0:0/1:0:0:0/rescan
echo 1 > /sys/devices/pci0000:00/0000:00:03.0/0000:15:00.0/host1/rport-1:0-1/target1:0:1/1:0:1:0/rescan
echo 1 > /sys/devices/pci0000:00/0000:00:07.0/0000:1a:00.0/host2/rport-2:0-0/target2:0:0/2:0:0:0/rescan
echo 1 > /sys/devices/pci0000:00/0000:00:07.0/0000:1a:00.0/host2/rport-2:0-1/target2:0:1/2:0:1:0/rescan
# messages log  file show that  the new size was detected.
>tail -f /var/log/messages
Sep 19 09:56:10 server1 kernel: sd 1:0:1:0: [sdc] 12884901888 512-byte logical blocks: (6.59 TB/6.00 TiB)
Sep 19 09:56:10 server1 kernel: sdc: detected capacity change from 5497558138880 to 6597069766656
Sep 19 10:05:57 server1 kernel: sd 1:0:0:0: [sdb] 15032385536 512-byte logical blocks: (7.69 TB/7.00 TiB)
Sep 19 10:05:57 server1 kernel: sdb: detected capacity change from 6597069766656 to 7696581394432
Sep 19 10:05:57 server1 kernel: sd 1:0:1:0: [sdc] 15032385536 512-byte logical blocks: (7.69 TB/7.00 TiB)
Sep 19 10:05:57 server1 kernel: sdc: detected capacity change from 6597069766656 to 7696581394432
Sep 19 10:05:57 server1 kernel: sd 2:0:0:0: [sdd] 15032385536 512-byte logical blocks: (7.69 TB/7.00 TiB)
Sep 19 10:05:57 server1 kernel: sdd: detected capacity change from 6597069766656 to 7696581394432
Sep 19 10:05:58 server1 kernel: sd 2:0:1:0: [sde] 15032385536 512-byte logical blocks: (7.69 TB/7.00 TiB)
Sep 19 10:05:58 server1 kernel: sde: detected capacity change from 6597069766656 to 7696581394432

Monday, September 3, 2012

Create GPT partition for LVM using parted tool

Traditional MBR(MSDOS) disk label has a limitation of 2^32 sectors (2 TiB with 512-byte sectors) in capacity and 15 partitions (including logical partitions), while GUID Partition Table (GPT) supports 2^64 sectors (8 ZiB with 512-byte sectors) and 128 partitions by default.

In Linux, fdisk doesn’t support GPT, parted is the common built-in tool for GPT.

#mpathb is the disk name is in FC SAN with multipath enabled in my test env
>parted  /dev/mapper/mpathb
(parted) mklabel gpt
(parted) mkpart primary ext4 1024kb 2tb
Warning: The resulting partition is not properly aligned for best performance.
Ignore/Cancel?
#This warning indicates the start position of the partition may not aligned with physical sector of the #hard disk. It is very important for harddisk of hardware raid, the start position must be n*stripe size.
#see also: http://honglus.blogspot.com.au/2009/08/align-partitions-on-stripe-boundary-for.html
#It may also hold true for single hard disk, because even a single harddisk has sector size of 2K,4K #nowadays .
#To fix the issue, just change the unit from SI to IEC 60027-2 standard
# k- stands for kilo, meaning 1,000 in Metric(SI) Prefix
# ki- stands for kilobinary ("kibi-"), meaning 1,024 in IEC 60027-2 standard
(parted) help unit
  unit UNIT                                set the default unit to UNIT
        UNIT is one of: s, B, kB, MB, GB, TB, compact, cyl, chs, %, kiB, MiB, GiB, TiB
(parted) mkpart primary ext4 1024KiB 8TiB
#the values are accepted without any warning
(parted) print
..
Number  Start   End     Size    File system  Name     Flags
 1      1049kB  8796GB  8796GB               primary
#1049KB is shown, because the default unit is KB, we change it to KiB
(parted) unit KiB
(parted) print
..
Number  Start    End            Size           File system  Name     Flags
 1      1024kiB  8589934592kiB  8589933568kiB               primary 
#set  LVM flag
#GPT has  reserved GUID for different partitions e.g LVM= E6D6D379-F507-44C2-A23C-238F2A3DF928
#
(parted) set 1 lvm on
(parted) p
Model: Linux device-mapper (multipath) (dm)
Disk /dev/mapper/mpathb: 19527106560kiB
Sector size (logical/physical): 512B/512B
Partition Table: gpt
Number  Start    End            Size           File system  Name     Flags
 1      1024kiB  8589934592kiB  8589933568kiB               primary  lvm
#create LVM physical volume as usual.
>pvcreate /dev/mapper/mpathb1

Tuesday, May 17, 2011

GFS (Global File System) quickstart

What is GFS?
GFS allow all nodes to have direct CONCURRENT write access to the same shared BLOCK storage.
For local file system e.g ext3, A shared BLOCK storage can be mounted in multiple nodes, but CONCURRENT write access is not allowed
For NFS, the CONCURRENT write access is allowed, but it is not direct BLOCK device, which introduce delay and another layer of failure.
GFS requirements:
- A shared block storage (iSCSI, FC SAN etc.. )
- RHCS (Red hat Cluster suite) (although GFS can be mounted in standalone server without cluster, it is primarily used for testing purpose or recovering data when cluster fails)
- RHEl 3.x onwards (RHEL derivatives: Centos/Fedora), it should work in other Linux distributions, since GFS and RHCS have been open sourced.
GFS specifications:
- RHEL 5.3 onwards use GFS2
- RHEl 5/6.1 supports maximum 16 nodes
- RHEL 5/6.1 64 bit supports maximum file system size of 100TB (8 EB in theory)
- Supports: data and metadata journaling, quota, acl, Direct I/O, growing file system online, dynamic inodes (convert inode block to data block) 
- LVM snapshot of CLVM under GFS  is NOT yet supported.
GFS components:
RHCS components: OpenAIS, CCS, fenced, CMAN and CLVMD (Clustered LVM)
GFS specific component: Distributed Lock Manager (DLM)
Install RHCS and GFS  rpms
Luci (Conga project) is the easiest way to install and configure RHCs and GFS.
#GFS specific packages:
#RHEL 5.2 or lower versions 
$yum install gfs-utils    kmod-gfs 
#RHEL 5.3 onwards, gfs2 module is part of kernel 
$yum install gfs2-utils   
Create GFS on LVM
You can create GFS on raw device, but LVM is recommended for consistent device names and the ability to extend device
#Assume you have setup and tested a working RHCS
#Edit cluster lock type in /etc/lvm/lvm.conf on ALL nodes
locking_type=3 

#Create PV/VG/LV as if in standalone system ONCE in any ONE of the nodes

#Start Cluster and clvmd on ALL nodes 
#Better use luci GUI interface to start whole cluster 
$service cman start
$service rgmanager start
$service clvmd start

#Create GFS ONCE in any ONE of the nodes
# -p lock_dlm is required in cluster mode. Lock_nolock is for standalone system
# -t cluster1:gfslv      ( Real cluster-name: arbitrary  GFS name )
# Above information is stored in GFS superblock, which can be changed with “gfs_tool sb” without re-initializing GFS e.g change lock type: "gfs_tool sb /device proto lock_nolock" 
#-j 2: the number of journals, minimum 1 for each node. The default journal size is 128Mib, can be overridden by -J
#additional journal can be added with gfs_jadd
gfs_mkfs -p lock_dlm -t cluster1:gfslv -j 2 /dev/vg01/lv01

#Mount GFS in cluster member by /etc/fstab
#put GFS mount in /etc/fstab in ALL nodes
#NOTE:
#Cluster service can mount GFS without /etc/fstab after adding GFS as resource, but It can only mount on one node (the active node).  Since GFS is supposed to be mounted on all nodes at the same time. /etc/fstab is a must, GFS resource is optional.
#GFS mount options: lockproto, locktable are optional, mount can obtain the information from superblock automatically
$cat /etc/fstab
/dev/vg01/lv01          /mnt/gfs                gfs     defaults 0 0

#Mount all GFS mounts
service gfs start
GFS command lines
####Check GFS super block 
#some values can be changed by “gfs_tool sb”
$gfs_tool sb /dev/vg01/lv01 all
sb_bsize = 4096
sb_lockproto = lock_dlm
sb_locktable = cluster1:gfslv01
.. 
####GFS tunable parameters 
#view parameters
gfs_tool gettune <mountpoint>
#set parameters 
#The parameters don’t persist after re-mount, You can customize /etc/init.d/gfs to set tunable parameters on mounting
gfs_tool settune <mountpoint>

####Performance related parameters
#like other file system, you can disable access time update by mount option “noatime”
#GFS can also allow you to control how often to update access time
$gfs_tool gettune /mnt/gfs | grep atime_quantum   
atime_quantum=3660          #in secs

#Disable quota, if not needed
#GFS2 remove the parameter and implement it in mount option “quota=off”
$gfs_tool settune /mnt/gfs quota_enforce 0

#GFS direct I/O
#Enable directI/O for database files, if DB has its own buffering mechanism to avoid “double” buffering 
$gfs_tool setflag directio /mnt/gfs/test.1     #file attribute
$gfs_tool setflag inherit_directio /mnt/gfs/db/     #DIR attribute
$gfs_tool clearflag directio /mnt/gfs/test.1              #remove attribute
$gfs_tool stat  inherit_directio /mnt/gfs/file     # view attribute

#enable data journal for very small files
#disable data journal for large files
$gfs_tool setflag inherit_jdata  /mnt/gfs/db/     #Enable  data journal (only metadata  has journal  by default) on a dir. (if operate on a file, the file must be zero size)

###GFS backup, CLVM doesn't support snapshot
$gfs_tool freeze /mnt/gfs          #change GFS to read-only (done once in any one of the nodes)
$gfs_tool unfreeze /mnt/gfs

###GFS repair 
#after unmount GFS on all nodes
$gfs_fsck  -v /dev/vg01/lv01         # gfs_fsck -v -n /dev/vg01/lv01 : -n answer no to all questions, inspect gfs only without making changes

GFS implementation scenarios:
GFS’s strength is the ability to do concurrent write to the same block device, It make it possible for Active-Active cluster nodes to write to the same block device, but there are few such cases in real life.
In Active-Active cluster nodes (all nodes perform the same task), RHCS can’t do load balancing itself, it requires external load balancer
 - Database server cluster: In theory, all nodes can write to the same DB file concurrently, However, the performance will be degraded, because all nodes try to lock the file via Distributed Lock Manager.  You can assign different task to cluster nodes to write to different DB file, e.g. node-A run DB-A and node-B run DB-B, but this can be done, without GFS, by mounting  ext3 on individual iSCSI/FC disk.
GFS doesn’t lose to ext3 in above scenario, but its lack of LVM snapshot of in GFS‘s CLVMD kills my inspiration of using DB on GFS
 - Application server cluster: e.g. Apache, Jboss server cluster. It is true that GFS can simplify application package deployment because all nodes can share the same application package binaries. But if you only use a two-node cluster, deploying the application twice is not a big hassle. Maintaining a single copy of application binaries is convenient, but at risk of single point of failure.
 - NFS Cluster: Because NFS is I/O bound, Why would you run Active-Active NFS cluster with CPU/memory resource in nodes are not being fully utilized? 

Tuesday, May 10, 2011

LVM2: device filter and LVM metadata restore

Customize LVM device filter to get rid of the annoying “/dev/cdrom: open failed” warning
##/dev/cdrom: open failed warning
$pvcreate /dev/sdb1
/dev/cdrom: open failed: Read-only file system
$ vgcreate vg01 /dev/sdb1
/dev/cdrom: open failed: Read-only file system
##The error occurs because LVM scans all device files by default; you can exclude some device files with device filters
##File /etc/lvm/cache/.cache contains the device file names scanned by LVM
$ cat /etc/lvm/cache/.cache
persistent_filter_cache {
valid_devices=[
"/dev/ram11",
"/dev/cdrom",
##Edit /etc/lvm/lvm.conf, Change default filter  
filter = [ "a/.*/" ]
#to
filter = [ "r|/dev/cdrom|","r|/dev/ram*|" ]
##You need to delete the cache file or ran vgscan to regenerate the file
$rm /etc/lvm/cache/.cache   OR vgscan
LVM metadata backup and restore 
LVM record every LVM VG and LV metadata operation and save it to /etc/lvm/backup automatically, old version backup files are archived to /etc/lvm/archive.
The backup file can be used to rollback LVM metadata changes, for example, if you have removed the VG/PV or even re-initialize disk with pvcreate, Don't panic,as long as file system was not re-created, you can use vgcfgrestore to restore all the data.
The following is to demonstrate how to recover a LV after it is completed destroyed from PV level (pvremove)
1.Create test LV and write some data
$pvcreate  /dev/sdb1 /dev/sdb2
Physical volume "/dev/sdb1" successfully created
Physical volume "/dev/sdb2" successfully created
$vgcreate vg01  /dev/sdb1 /dev/sdb2
Volume group "vg01" successfully created
$ lvcreate -L100M -n lv01 vg01
Logical volume "lv01" created
$ mkfs.ext3 /dev/vg01/lv01
$ mount /dev/vg01/lv01 /mnt/
$cp /etc/hosts /mnt/
$ ls /mnt/
hosts  lost+found
2.Destroy LV,VG,and PV
$vgremove vg01
Do you really want to remove volume group "vg01" containing 1 logical volumes? [y/n]: y
Do you really want to remove active logical volume lv01? [y/n]: y
Logical volume "lv01" successfully removed
Volume group "vg01" successfully removed
#VG is removed and PV was also wiped out
$ pvcreate /dev/sdb1 /dev/sdb2
Physical volume "/dev/sdb1" successfully created
Physical volume "/dev/sdb2" successfully created
3.Lets recover the LV and the data
##Find out the backup file to restore from
$vgcfgrestore -l vg01
..
file:         /etc/lvm/archive/vg01_00002.vg
VG name:      vg01
Description:  Created *before* executing 'vgremove vg01'
Backup time:  Tue May 10 15:41:31 2011
##first attempt failed, because PV UUID is changed
$ vgcfgrestore -f /etc/lvm/archive/vg01_00002.vg vg01
Couldn't find device with uuid 'pVf1J2-rAsd-eWkD-mCJc-S0pc-47zc-ImjXSB'.
Couldn't find device with uuid 'J14aVl-mbuj-k9MM-63Ad-TBAa-S0xF-VElV2W'.
Cannot restore Volume Group vg01 with 2 PVs marked as missing.
Restore failed.
##Find old UUID
$ grep -B 2 /dev/sdb /etc/lvm/archive/vg01_00002.vg
pv0 {
id = "pVf1J2-rAsd-eWkD-mCJc-S0pc-47zc-ImjXSB"
device = "/dev/sdb1"    # Hint only
--
pv1 {
id = "J14aVl-mbuj-k9MM-63Ad-TBAa-S0xF-VElV2W"
device = "/dev/sdb2"    # Hint only
$
##Recreate PV with the old UUID
$ pvcreate -u pVf1J2-rAsd-eWkD-mCJc-S0pc-47zc-ImjXSB /dev/sdb1
Physical volume "/dev/sdb1" successfully created
$ pvcreate -u J14aVl-mbuj-k9MM-63Ad-TBAa-S0xF-VElV2W  /dev/sdb2
Physical volume "/dev/sdb2" successfully created
##run vgcfgrestore again
$ vgcfgrestore -f /etc/lvm/archive/vg01_00002.vg vg01
Restored volume group vg01
##data was also recovered
$ mount /dev/vg01/lv01 /mnt/
mount: special device /dev/vg01/lv01 does not exist
$ lvchange -a y vg01/lv01
$ mount /dev/vg01/lv01 /mnt/
$ cat /mnt/hosts
127.0.0.1       localhost
..

Tuesday, April 26, 2011

Do we really need to set partition type to fd(Linux auto raid) for Linux software RAID?

Almost all Linux RAID documents mandate that partition type must be fd(Linux auto raid)  before building Linux software RAID. Actually, this step is optional, it helps a little if your RAID device is /dev/md0 in Centos.
What is fd(Linux auto raid)?
As the name implies, it is for auto detection of  raid  when OS boots. If you have created /dev/md0 but didn't put it  in configuration file /etc/mdadm.conf, OS is able to detect the partitions and assemble /dev/md0.
But, this way of assembling RAID device only works for /dev/md0 in Centos by default.
It is because Centos only enable raidautorun for /dev/md0 by default. Any other md will be assembled by reading /etc/mdadm.conf
[Centos 5 ] $grep -A 3 raidautorun  /etc/rc.sysinit 
[ -x /sbin/nash ] && echo "raidautorun /dev/md0" | nash --quiet
if [ -f /etc/mdadm.conf ]; then
/sbin/mdadm -A -s
fi
#The auto detecting behavior is logged in kernel buffer
$ dmesg | grep -i auto
md: Autodetecting RAID arrays.
md: autorun ...
md: ... autorun DONE.
fd VS  RAID superblock
Don't confuse fd with RAID superblock,  fd is an optional flag recognized by  nash raidautorun command. But RAID superblock is, in every RAID device member, an essential piece of information, which contains RAID level, state and parent  MD device UUID (man 4 md).
#Examine superblock on logical device will encounter an error
#It is expected because superblock only exist in RAID member device
 $ mdadm --examine /dev/md0
mdadm: No md superblock detected on /dev/md0.

#Examine  superblock on RAID member
$ mdadm --examine /dev/sdb2
/dev/sdb2:
          Magic : a92b4efc
        Version : 0.90.00
           UUID : a31e6699:4360a3b7:38c544fa:f4e6faa9
  Creation Time : Wed Apr 27 11:19:34 2011
     Raid Level : raid1
  Used Dev Size : 104320 (101.89 MiB 106.82 MB)
     Array Size : 104320 (101.89 MiB 106.82 MB)
   Raid Devices : 2
  Total Devices : 2
Preferred Minor : 0

    Update Time : Wed Apr 27 12:51:58 2011
          State : clean
Internal Bitmap : present
 Active Devices : 2
Working Devices : 2
 Failed Devices : 0
  Spare Devices : 0
       Checksum : 58c72673 - correct
         Events : 20

#Scan  partitions superblock to find existing raid device.
$ mdadm --examine --brief --scan --config=partitions
ARRAY /dev/md1 level=raid1 num-devices=2 UUID=da55e1e2:c781a461:73d6dfa6:8c7cf6d6
##The above output can be saved to /etc/mdadm.conf; then mdadm -A -s will activate the RAID device.
##DEVICE member list is optional, because default is “DEVICE partitions”.
Conclusion
Partition type FD is a way of assembling raid used by nash raidautorun command and it only works for /dev/md0 in Centos by default.
If you use /etc/mdadm.conf  to assemble RAID, the FD flag is optional.  But setting this flag can help you to recognize RAID members from “fdisk -l”.

Saturday, March 19, 2011

Calculate chunk size for RAID device

Chunk size is a term often used in Linux Software RAID , In hardware RAID, different vendor has different definition e.g EMC call it element size.
Chunk size is the minimum amount of data written to each member before moving to the next. So it is only significant in Round-Robin Raid types: Raid 0/RAID5/RAID 6 ..etc. The purpose of tuning  Chunk size is to evenly distribute request to each member in RAID.
Chunk Size=avgrq-sz/ number of data disks
avgrq-sz:
The average size (in 512 Byte sectors) of the requests  that  were issued to the device.
number of data disks: Data disk only, excluding parity disk in RAID5/6
#Get avgrq-sz for a device since host is up
$iostat -x /dev/sdc
Device:         rrqm/s   wrqm/s     r/s     w/s   rsec/s   wsec/s avgrq-sz avgqu-sz   await  svctm  %util
sdc               1.96     1.68    1.33    0.54   274.67   273.13   293.56     0.02    8.61   0.90   0.17
#Size in kb
$ echo 293.56*512/1024|bc -l
146.78
#RAID 0 with 2 disks
Chunk Size(KB)=146.78/2=73.39
#Chunk size should be 2^n
Chunk Size (KB)=73.39=~64
#Create RAID 0 with chunk size=64 
 
$mdadm -C /dev/md0 -l 0 -n 2 --chunk=64 /dev/sda1 /dev/sdb1
#Create file system with optimal stride
#stride and chunk has the same meaning but different unit.
#Stride is on file system level, it is the number of blocks written to disk before moving to the next.
stride=chunk size / block size
#If I choose block size =4096, then stride is 64/4=16
$mkfs.ext3 -b 4096 -E stride=16 /dev/md0

Stride is irrelevant for hardware RAID, which is presented to host as single harddisk.

Friday, February 12, 2010

Improve OpenSolaris COMSTAR iSCSI Server performance.

I setup OpenSolaris iSCSI and Linux Open iSCSI in VirtualBox to compare the performance. I can get 70~80% of local FS throughput on Linux iSCSI, but it is 10~20% on OpenSolaris iSCSI.
Some tuning can boost OpenSolaris iSCSI performance to be on par with Linux iSCSI

#Environment
Server: OpenSolaris 2009.06 VS Openfiler 2.3 on VirtualBox 3.1
Client: Centos 5.2
Regardless of the iSCSI Server type, the network has to be optimized by tuning network parameters
#OpenSolaris iSCSI tuning
1. Use the block device (/dsk) as backstore, NOT the character device (/rdsk), because the block device (/dev/dsk) has a buffer.

$iscsitadm create target -b /dev/zvol/dsk/data/iscsi filera.dbvol

2. Turn on fast-write-ack

$iscsitadm modify admin --fast-write-ack enable

reactivate the zpool for existing target.

$svcadm disable iscsi/target iscsitgt
$zpool export poolname ;zpool import poolname
$svcadm enable iscsi/target iscsitgt


Use the option with care, from iscsitadm man pages:
Enables or disables fast-write acknowledgment. You should enable this option only if a system is connected to the power grid through a UPS.
Otherwise, data corruption could occur if power is lost and some writes that were acknowledged have not been completely flushed to the backing store.

Monday, February 8, 2010

Setup OpenSolaris ISCSI server for Solaris and Linux Client

Setup ISCSI server in Opensolaris 2009.06 and configure Opensolaris/Centos 5.2 client to connect to ISCSI server

Simple ISCSI setup without access-list

####ISCSI Server
##enable iscsi service
svcadm enable iscsi/target iscsitgt

## Create zpool on whole disk ( XXp0 is whole disk convention for i86 arch, SPARC arch use XXs2)
zpool create data c8t1d0p0
##create volume on the zpool
zfs create -V 1G data/iscsi

##Create ISCSI target filera.dbvol, the name is arbitrary, it doesn't need to be volume name

[Server]$iscsitadm create target -b /dev/zvol/dsk/data/iscsi filera.dbvol

[Server]$iscsitadm list target -v
Target: filera.dbvol
iSCSI Name: iqn.1986-03.com.sun:02:d4b25dbf-dbc7-4f46-b2b5-c34774a32ae0.filera.dbvol
Connections: 0
ACL list:
TPGT list:
LUN information:
LUN: 0
GUID: 0
VID: SUN
PID: SOLARIS
Type: disk
Size: 1.0G
Backing store: /dev/zvol/dsk/data/iscsi
Status: online

Command "zfs set shareiscsi=on data/iscsi " can also create a target, but the target name will be the volume name

## if ISCSI server has multiple NICS, the target will be exposed on all NICS, following is to limit the target to the desired NIC
[Server]$iscsitadm create tpgt 1
[Server]$iscsitadm modify tpgt -i 172.16.1.12 1
[Server]$iscsitadm modify target -p 1 filera.dbvol

####Client: Solaris
Client-Solaris]$svcadm enable iscsi_initiator
[Client-Solaris]$iscsiadm list initiator-node
Initiator node name: iqn.1986-03.com.sun:01:946808d8ea81.4b6a1021
##Change the long name to a short friendly name
[Client-Solaris]$iscsiadm modify initiator-node -N iqn.1986-03.com.sun:01:opensolaris01
#add the server address to discover
[Client-Solaris]$iscsiadm modify discovery -t enable
[Client-Solaris]$iscsiadm add discovery-address 172.16.1.12
[Client-Solaris]$iscsiadm list target -v
#check that the new disk is discovered
[Client-Solaris]$echo | format

####Client: Linux
##assign a friendly name to the client
[Client-Linux]$ cat /etc/iscsi/initiatorname.iscsi
InitiatorName=iqn.1994-05.com.redhat:centos01
[Client-Linux]$service iscsi start
[Client-Linux]$iscsiadm --mode discovery --type sendtargets --portal 172.16.1.12
##login to ISCSI discovered in previous step
[Client-Linux]$iscsiadm -m node --targetname "iqn.1986-03.com.sun:02:d4b25dbf-dbc7-4f46-b2b5-c34774a32ae0.filera.dbvol" --portal "172.16.1.12:3260" --login
##Check the new discovered disk
[Client-Linux]$fdisk -l



ISCSI access list and CHAP authentication


An access list is a must for security reasons; it is just like LUN mapping in a SAN. If you need additional security, add a chap-name and chap-secret. An access list alone works fine without CHAP initialized.


##On Server, Set up CHAP information for Linux client

##Create mapping of initiator ISCSI name to an arbitrary name

[Server]$iscsitadm create initiator --iqn iqn.1994-05.com.redhat:centos01   centos01
[Server]$iscsitadm modify initiator --chap-name user_centos01 centos01
[Server]$iscsitadm modify initiator --chap-secret centos01
[Server]$iscsitadm modify target --acl centos01 filera.dbvol

##On Server, Set up CHAP information for Solaris client

[Server]$iscsitadm create initiator --iqn iqn.1986-03.com.sun:01:opensolaris01   opensolaris01
[Server]$iscsitadm modify initiator --chap-name user_opensolaris01 opensolaris01
[Server]$iscsitadm modify initiator --chap-secret opensolaris01
[Server]$iscsitadm modify target --acl opensolaris01 filera.dbvol

##On Linux Client, use the CHAP info to login

[Client-Linux]$iscsiadm -m node --targetname "iqn.1986-03.com.sun:02:d4b25dbf-dbc7-4f46-b2b5-c34774a32ae0.filera.dbvol" --portal "172.16.1.12:3260" --op=update --name node.session.auth.authmethod --value=CHAP
[Client-Linux]$iscsiadm -m node --targetname "iqn.1986-03.com.sun:02:d4b25dbf-dbc7-4f46-b2b5-c34774a32ae0.filera.dbvol" --portal "172.16.1.12:3260" --op=update --name node.session.auth.username --value=user_centos01
[Client-Linux]$iscsiadm -m node --targetname "iqn.1986-03.com.sun:02:d4b25dbf-dbc7-4f46-b2b5-c34774a32ae0.filera.dbvol" --portal "172.16.1.12:3260" --op=update --name node.session.auth.password --value=Password0Password0
[Client-Linux]$iscsiadm -m node --targetname "iqn.1986-03.com.sun:02:d4b25dbf-dbc7-4f46-b2b5-c34774a32ae0.filera.dbvol" --portal "172.16.1.12:3260" --login

##On Solaris Client, use the CHAP info to login

[Client-Solaris]$ iscsiadm modify initiator-node --authentication CHAP
[Client-Solaris]$ iscsiadm modify initiator-node --CHAP-name user_opensolaris01
[Client-Solaris]$ iscsiadm modify initiator-node --CHAP-secret
Enter secret :
Re - enter secret :

Thursday, August 20, 2009

Align partitions on the stripe Boundary for Linux and Windows to boost performance

Aligning partitions on the stripe boundary can boost IO performance by up to 20%, depending on file system block size, stripe size, intensity of IO workload, etc. The disk alignment issue exists for environments in which all of the following factors are met

Disk: Hardware Raid(Including SAN)
Server: X86 32bit or 64bit PC server .
OS: Linux or Windows ( BSD, Solaris not investigated)

Terms:
- Sector Size: Normally 512 byte as industry standard to lower-format a single harddisk.
- Stripe Size: The smallest unit used by SAN, Hardware Raid and software Raid starting from 2 KB, in power of 2. but 32,64,128 is common stripe size
- Block Size: The smallest amount of disk space which can be allocated to hold a file for file system, ext3,NTFS is 4k by default

Issue:
Due to the x86 architecture's BIOS limitation, the first partition starts at sector 63 by default in Windows or Linux.
As a result, the partition doesn't align with the stripe boundary, so there are chances that one FS block sits across 2 stripes, meaning one request involves 2 physical IOs. The chance can be calculated as (FS block size / stripe size), so it is 100% for a 4K FS block on a 4K stripe size.

The offset should be multiple of stripe size, if you are not sure the stripe size, start at 1M should be safe.
Take 64K stripe size for example:
((Partition offset) * (Disk sector size)) / (Stripe unit size)

(63 * 512) / 65536=0.4921875
(128* 512) / 65536=1
So the partition should start at sector 128 (65536 bytes) at the earliest

Verify:
- Linux: fdisk -lu
$ fdisk -lu
Device Boot Start End Blocks Id System
/dev/sda1 63 37736684 18868311 83 Linux
- Windows:
Any version up to Windows 2003 is affected by default; Windows 2008 has fixed the issue

Fix:
- Linux:
fdisk: go to expert mode by typing x, then select b to adjust the starting block
- Windows:
diskpart detailed in the Windows KB

Reference:
http://support.microsoft.com/kb/929491
http://www.vmware.com/pdf/esx3_partition_align.pdf
http://now.netapp.com/Knowledgebase/solutionarea.asp?id=kb8190

Tuesday, May 19, 2009

Netapp Notes

#== Basics
- Introduction
Data ONTAP is the name of Netapp's Platform OS, it is based on BSD.
Netapp appliance is based on i86 PC hardware(mostly AMD Opteron nowadays).
NetApp appliance is unified storage supports file-based protocols such as NFS, CIFS, FTP, TFTP, and HTTP, block-based protocols such as FC and iSCSI.
NetApp V-Series can attach and manage third party storage systems.
NVRAM: log transitions, it can replay the log in the event of unplanned shutdown.
RAM: system memory, data read/write cache
Flash card(not all models): system initial boot media

FC Controller port 0a/0b/0c/0d ....
Jumper to setup Disk shelf ID 1 - 7
Disk numbering from right to left 16 ......
FC:
WWNN - world wide node name
is a World Wide Name assigned to a node (the device itself) in a Fibre Channel fabric
WWPN - world wide port name
is a World Wide Name assigned to a port in a Fibre Channel fabric; it performs a function equivalent to the MAC address in the Ethernet protocol. The same WWNN can be seen on many different ports (each with its own WWPN)

ISCSI:
Initiator= client, target = server
Software initiator with standard NIC
TCP offload Engine TOE with soft initiator – offload computing from CPU
ISCSI HBA -Hardware initiator and provide diskless boot
No need to use ISCSI HBA, if diskless boot feature is not needed and CPU resource is plenty

Netapp use software ISCSI if no ISCSI HBA present.

# setup iscsi on linux
[root@linux /]# iscsi-iname
iqn.1987-05.com.cisco:01.8494d06dc35d
The ID generated is random; write it to /etc/iscsi/initiatorname.iscsi to be persistent

- help
help # list all cmds
cifs help shares #display help for sub command
man cmd
priv set advanced # to use advanced command e.g ls
- account management
useradmin group list
useradmin user add admin -g Administrators
- Access netapp
http://IP/na_admin # web gui URL.
ssh IP # ssh (or rsh, telnet in) for CLI access, enable ssh with secureadmin cmd
rsh IP #execute cmd remotely from an admin server; the admin server should be added to /etc/hosts.equiv
get root mount of /vol/vol0/ in a unix machine to edit config files
- Read/write file
rdfile /etc/exports
wrfile -a /etc/hosts 1.1.1.1 filer1 #append to file
wrfile /etc/hosts.equiv #Rewrite file, type in lines, then ctr+C
- Error messages
rdfile /etc/messages
- backup config file
config dump config.bak
it is saved to /etc/configs/config.bak
- server setting
options command control server setting
options ftpd.enable on #enable ftp server for example
options nfs.export.auto-update off #turn off auto export, otherwise the new volume will be exported automatically
all options are saved to /etc/registry
- system stats
stats show
sysstat -su 1
- copy entire volume
vol copy
ndmpd on
ndmpcopy -f /vol/vol0 /vol/vol0_trad
- Cluster
A Netapp cluster doesn't do I/O load balancing; it is just for fail-over purposes. You need to allocate disks in each node for different services


#= =Boot
Ctrl+C to go to boot menu.
there is option to reset password
type in "22/7" to show secret boot menu

#= =Storage

Qtree, and/or subdirectories, export-able
|
Volume (TradVol, FlexVol), export-able, snapshot configured at this level.
|
aggregate (OnTap 7.0 and up)
|
plex (relevant mostly in mirror conf)
|
raid group
|
disk


disk zero spare # zero all spare disk so they can be added quickly to a volume
Data ONTAP supports 100 aggregates (including traditional volumes) on a single storage system.
Data ONTAP supports 500 volumes per head (for FAS2020 and FAS200 series, the limit is 200 FlexVol volumes). So in a cluster environment, the combined volume count of both nodes should not exceed the limit for the sake of failover.
Netapp wafl(Write Anywhere File layout) block size = 4 KB.
Netapp support Raid0, Raid4, RaidDP (double-parity), Raid 1 (via snapmirror)

- Traditional Volume
It is tightly coupled with its containing aggregate. No other volumes can get their storage from this containing aggregate. It can't be shrunk, but it can be expanded by adding more disks

- FlexVol volumes
A FlexVol volume is a volume that is loosely coupled to its containing aggregate. A FlexVol volume
can share its containing aggregate with other FlexVol volumes,Thus, a single aggregate can be the
shared source of all the storage used by all the FlexVol volumes contained by that aggregate.

- flex clone
FlexClone volumes always exist in the same aggregate as their parent volumes
You cannot delete the base Snapshot copy in a parent volume while a FlexClone volume using that
Snapshot copy exists. The base Snapshot copy is the Snapshot copy that was used to create the
FlexClone volume, and is marked busy, vclone in the parent volume.

- snapshot
255 snapshot per volume
A Snapshot copy is a frozen, read-only image of a traditional volume, a FlexVol volume, or an aggregate
that captures the state of the file system at a point in time. It doesn't consume space initially, snapshot grows only as data changes.
- Aggregate
Max space: 16TB Max Number: 100
1 big aggregate runs faster than multiple aggregates created on the same number of physical disks
A 16-disk setup is the sweet spot for space utilization and performance
- Lun
A LUN is created on top of a volume and can't exceed the volume size; it is used for FC/ISCSI mounts

#= = command
sysconfig -r # show raid group and spare disks
aggr status -s #only show spare disks
sysconfig -V #show aggregate name and the numer of disk owned
sysconfig -d # show physical disks
storage show disk # show physical disks

- Traditional volume
vol create travol1 3 #create a traditional volume with 3 disks, disks are selected automatically
vol create trad02 -d 0a.24 0a.25 0a.27 #create a traditional volume by specifying disk names

- Flex volume
aggr create aggr2 4 #create aggregate first
vol create flexvol1 aggr2 20M #create volume on the aggregate
vol offline trad02
vol destroy trad02 #delete volume, need to be brought offline first
vol options vol1 nosnap on #turn off automatic scheduled snapshots, not the snapshot ability
aggr show_space #show aggregate and volume space
aggr options aggr1 raidtype raid_dp #change raidtype, e.g. between raid4 and raid_dp
df -h #show volume space

- Snapshot
snap create vol0 mysnap0
snap delete vol0 mysnap0
snap list vol0 #list snapshot
snap delta vol0 #show size of changed data
/vol/vol0/.snapshot/mysnap0 #access snapshot data

#= = Network
- show ip/change ip
ifconfig
- add route
route add net 100.100.100.0 192.168.211.2 1
- permanent add
wrfile -a /etc/rc route add net 100.100.100.0 192.168.211.2 1
- vi /etc/rc
routed on # turn on RIP routing
- Link aggregation
vif
- Package tracing
pktt start ns0
pktt dump ns0 #a xx.trc file will be saved to /
pktt stop ns0

#= =NFS
- turn off nfs auto export for new volume
options nfs.export.auto-update off
- Show exports
exportfs -v
- show detailed export options
exportfs -q /vol/vol0
/vol/vol0 -sec=sys,(ruleid=0),rw,anon=0,nosuid
- Permanent export and add entry to /etc/exports
exportfs -p sec=sys,rw,nosuid /vol/vol1
- Temp export and don't add entry to /etc/exports
exportfs -io sec=sys,rw,nosuid /vol/vol1
- permanent unexport, remove from /etc/exports
exportfs -z path
- Temp unexport (entry stays in /etc/exports)
exportfs -u path
- Re-read /etc/exports and re-export
exportfs -r
- Control access
exportfs -io sec=sys,rw=10.10.10.1 /vol/vol1
or
exportfs enable nosave 10.10.10.1 /vol/vol1

#==CIFS
- Stop/start service
cifs terminate / cifs restart
- Initial setup
cifs setup /* select (3) Windows Workgroup authentication using the filer's local user accounts */
- Determine if both nfs client/cifs client access system
options wafl.default_security_style unix ntfs mixed
- display shares
cifs shares
- add share
cifs shares -add HOME /vol/vol0/home
- add permission
cifs access -delete home everyone
cifs access HOME Administrators "Full Control"